R语言批量获取CFH网站植物信息
安装包
library("xml2")
library("rvest")
library("dplyr")
library("stringr")
library(xlsx)
代码及注释
data <- data.frame()#创建数据框
a<-read.xlsx("CFH.xlsx",1)#读取文件,第一列为植物的代号
d<-a[,1]
for (i in 1:length(d)) {
site <- paste("http://www.cfh.ac.cn/",d[i],".sp",sep="")#每一个植物d额网址
file <- paste("./",d[i],".xml",sep="")
JudgeXML<-try(download.file(site, destfile = file, quiet = TRUE),silent=TRUE)
if(JudgeXML==0){#判断是否存在植物信息,未存在则跳过
web <- readLines(site,encoding="UTF-8")#第一种从网址获取数据的方法
web1 <- read_html(site, encoding = "UTF-8")#第二种从网址获取数据的方法
Title <- web1%>%html_nodes("title")%>%html_text()#便于判断是否包含植物信息
if(Title=="未找到物种信息"){
}else{
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Species'>.+</span>")#判定该条植物信息是界门纲目科属种等
SpLatinName <- unlist(SpLatinName)
level="Species"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Genus'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Genus"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Family'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Family"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Subser.'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Subser"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Kingdom'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Kingdom"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Phylum'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Phylum"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Class'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Class"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Subclass'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Subclass"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Order'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Order"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Subfamily'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Subfamily"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Tribe'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Tribe"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Subgenus'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Subgenus"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Section'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Section"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Series'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Series"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Domain'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Domain"
if(length(SpLatinName)==0){
SpLatinName <- str_extract_all(string = web, pattern = "<span class='SpLatinName' level='Subphylum'>.+</span>")
SpLatinName <- unlist(SpLatinName)
level="Subphylum"
if(length(SpLatinName)==0){
SpLatinName="无"
level="Others"
}else{}
}else{}
}else{}
}else{}
}else{}
}else{}
}else{}
}else{}
}else{}
}else{}
}else{}
}else{}
}else{}
}else{}
}else{}
}else{}
Latin.names <- str_extract(string = SpLatinName, pattern = ">[^&].+<") %>% str_replace_all(string = ., pattern = ">|<",replacement = "")#获取拉丁名
Author <- web1%>%html_nodes(".SpLatinNameSAuthor")%>%html_text()#拉丁人名
CNname <- web1%>%html_nodes(".SpCName")%>%html_text()#中文名
p <- str_extract_all(string = web, pattern = "<td>.+</td>")#获取td下的数据列表
p <- unlist(p)#展示出列表
LatinYM="无"#先假设拉丁异名无
BM="无"#先假设别名无
ZM="无"#先假设正名无,然后开始循环查找
for(j in 1:length(p)){
if(stringr::str_detect(p[j],"异名")){YM <- web1%>%html_nodes(".plantname")%>%html_text()#如果数据列表中包含“异名”两个字,则获取YM
if(length(YM)==0){LatinYM="无"
}else{x=1
LatinYM<-YM[1]
while(x < length(YM)){LatinYM <- paste(LatinYM,"/",YM[x+1],sep="")#异名多个的时候可以将其合并放在一行
x=x+1}}
}else if(stringr::str_detect(p[j],"别名")){#如果包含别名则获取别名
BM <- str_extract_all(string = web, pattern = "<span style='margin-left:5px; margin-right:5px;'>.+</span>")
BM<- unlist(BM)
BM <- str_extract(string = BM, pattern = ">[^&].+<") %>% str_replace_all(string = ., pattern = ">|<|span| style='margin-left:5px; margin-right:5px;'",replacement = "")
}else if(stringr::str_detect(p[j],"正名")){ZM <- str_extract_all(string = web, pattern = "<span class='plantname'>.+</span>")#获取正名
ZM <- unlist(ZM)
if(length(ZM)==0){ZM="无"#如果正名长度为0,那么它则不存在正名
}else{
ZM <- str_extract(string = ZM, pattern = ">[^&].+<") %>%
str_replace_all(string = ., pattern = ">|<",replacement = "")
}
}else{}}
CFH <- data.frame(d[i],CNname,level,Latin.names,Author,LatinYM,BM,ZM,site)#封装数据
data <- rbind(data,CFH)
print(paste("已完成第",i,"个",":",CNname,",","剩余",length(d)-i,"个待处理"))#打印进展信息
}
}else{}
}
write.csv(data,file="./CFH.csv")#导出数据
其他:
.......