静态网页爬虫-安居客
python+request+xpath 12小时1w3s条数据 动态验证码
难度:★★
"""
@version: python3.6
@author: 'achai'
@software: PyCharm
@file: demo_5.py
@time: 2020/12/26 22:03
"""
import urllib.request
from lxml import etree
import xlwt
import re
import time
import random
# Workbook and single worksheet that receive one row per scraped listing.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('housePrice')

# Titles for row 0, in column order (columns 0-15).
_COLUMN_TITLES = (
    '经度',          # longitude
    '纬度',          # latitude
    '毛坯或者精装',  # rough or finished decoration
    '容积率',        # plot ratio
    '绿化率',        # greening rate
    '物业费',        # property fee
    '电梯',          # elevator
    '低层或者高层',  # low or high floor
    '室',            # bedrooms
    '厅',            # living rooms
    '卫',            # bathrooms
    '面积',          # area
    '建造年',        # build year
    '房价',          # price
    '小区名称',      # community name
    '小区链接',      # link
)
for _col, _title in enumerate(_COLUMN_TITLES):
    sheet.write(0, _col, _title)

# Index of the next worksheet row to fill; row 0 is taken by the titles.
n = 1
# HTTP request headers passed to every urllib.request.Request built in this
# file: a desktop Chrome User-Agent string and a fixed Cookie string.
# NOTE(review): the cookie looks like it was copied from a browser session and
# presumably expires - confirm and refresh before a long crawl.
page_headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"Cookie": "aQQ_ajkguid=A789826A-7A5C-311C-565E-05CDD916C93C; id58=e87rkF/E7Q5+DzArBOW3Ag==; 58tj_uuid=a6d317a6-9b64-4a76-b92b-5af34c96bde3; als=0; wmda_new_uuid=1; wmda_uuid=a1e5f2b84628e0abf6f31c71bfa07655; wmda_visited_projects=%3B6289197098934; isp=true; ctid=28; _ga=GA1.2.646930384.1606826267; sessid=B47733BE-C6A8-8FEE-E0D8-FB445F012E3B; twe=2; wmda_session_id_6289197098934=1608990327081-f825170b-4ec7-d5d4; obtain_by=1; new_session=1; init_refer=; new_uv=3; _gid=GA1.2.1955540.1608990327; _gat=1; xxzl_cid=d869ee07e4404df6b994853a3eba70ee; xzuid=8b5f7a17-6587-4c55-bcce-02b0664b781f"
}
def getAllurl(baseUrl, max_pages=50, timeout=30):
    """Collect the listing links found on the paginated result pages.

    Pages are requested as ``baseUrl + 'p<N>/'`` for N = 1..max_pages, each
    with the module-level ``page_headers``.

    Args:
        baseUrl: Result-list URL ending in '/', for example
            https://sjz.anjuke.com/sale/yuhuaf/
        max_pages: How many result pages to walk. Defaults to 50, the value
            that used to be hard-coded in the loop.
        timeout: Seconds allowed for each request, so a stalled connection
            cannot block the crawl indefinitely.

    Returns:
        A list of every href matched on the visited pages, in page order.
        Empty when ``max_pages`` is zero or negative.

    Raises:
        Whatever urllib raises for a failed or timed-out request; errors are
        not caught here.
    """
    listall = []
    for page_no in range(1, max_pages + 1):
        # e.g. https://sjz.anjuke.com/sale/yuhuaf/p2/
        fullUrl = baseUrl + 'p' + str(page_no) + '/'
        req = urllib.request.Request(fullUrl, headers=page_headers)
        # The with-block closes the HTTP response as soon as it has been read.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = resp.read().decode('utf-8', 'ignore')
        html = etree.HTML(data)  # parse the page into an element tree
        # Every listing title link on this result page.
        house_url = html.xpath('//div[@class="house-details"]/div[@class="house-title"]/a/@href')
        listall.extend(house_url)
        print('该页所有链接提取完成!', page_no)
    return listall
def _fetch_tree(url, timeout):
    """Download *url* with ``page_headers`` and return it parsed by lxml."""
    req = urllib.request.Request(url, headers=page_headers)
    # The with-block closes the HTTP response as soon as it has been read.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        data = resp.read().decode('utf-8', 'ignore')
    return etree.HTML(data)


def _numbers(text_nodes):
    """Join *text_nodes* with ',' and return every decimal number found."""
    return re.findall(r"\d+\.?\d*", ",".join(map(str, text_nodes)))


def getMsg(Ourl, timeout=30):
    """Scrape one listing page and append its fields as a worksheet row.

    The page is fetched with the module-level ``page_headers`` and the fields
    are located with fixed XPath expressions. When the area field is present,
    the values are written to row ``n`` of the module-level ``sheet``, ``n`` is
    incremented and ``book`` is saved to 'anjuke_sjz.xls'. When the area field
    is empty, nothing is written and nothing is printed.

    Any ordinary exception raised while fetching, parsing or writing is
    reported with the captcha warning plus the error itself, followed by a
    30-second pause; it is not re-raised. KeyboardInterrupt and SystemExit are
    deliberately not caught, so the crawl can still be stopped.

    Args:
        Ourl: URL of the listing page to scrape.
        timeout: Seconds allowed for the request, so a stalled connection
            cannot block the crawl indefinitely.

    Returns:
        None.
    """
    global n
    # Common XPath prefix shared by most listing fields.
    panel = '//*[@id="__layout"]/div/div[2]/div[2]/div[2]/div[1]'
    community = '//*[@id="community"]/div[1]'
    try:
        print(Ourl)
        html_1 = _fetch_tree(Ourl, timeout)

        # Coordinates: the first two numbers in the third ssr <meta> content.
        # An IndexError here (fewer than two numbers found) lands in the
        # handler below.
        data_coord = _numbers(html_1.xpath('//meta[@data-n-head="ssr"][3]/@content'))
        lng = data_coord[0]
        lat = data_coord[1]

        # Decoration (rough or finished).
        data_decoration = html_1.xpath(panel + '/div[3]/div[2]/div[2]/text()')
        # Plot ratio.
        data_plotRatio = html_1.xpath(community + '/div[2]/div[2]/p/text()')
        # Greening rate, digits only.
        data_greeningRate = _numbers(html_1.xpath(community + '/div[3]/div[1]/p[2]/text()'))
        # Property fee, digits only.
        data_propertyFee = _numbers(html_1.xpath(community + '/div[2]/div[1]/p[2]/text()'))
        # Elevator.
        data_elevator = html_1.xpath(panel + '/div[4]/div[1]/span/text()')
        # Low or high floor.
        data_floor = html_1.xpath(panel + '/div[3]/div[1]/div[2]/text()')
        # Layout: bedrooms / living rooms / bathrooms.
        data_jishi = html_1.xpath(panel + '/div[3]/div[1]/div[1]/i[1]/text()')
        data_jiting = html_1.xpath(panel + '/div[3]/div[1]/div[1]/i[2]/text()')
        data_jiwei = html_1.xpath(panel + '/div[3]/div[1]/div[1]/i[3]/text()')
        # Area.
        house_area = html_1.xpath(panel + '/div[3]/div[2]/div[1]/i/text()')
        # Build year, digits only.
        house_time_2 = _numbers(html_1.xpath(panel + '/div[3]/div[3]/div[2]/text()'))
        # Price.
        house_price = html_1.xpath(panel + '/div[2]/div[1]/span[1]/text()')
        # Community name, reduced to its CJK characters.
        name = html_1.xpath(panel + '/div[4]/div[2]/div[1]/a[1]/text()')
        name_1 = " , ".join(name)
        houseName = ''.join(re.findall('[\u4e00-\u9fa5]', name_1))

        # An empty area field is used as the sign that no real listing page
        # was returned; in that case the row is skipped.
        if house_area:
            # NOTE(review): several values are lists of text nodes handed to
            # xlwt as-is - confirm how xlwt renders a list in a cell.
            row = [
                lng, lat, data_decoration, data_plotRatio, data_greeningRate,
                data_propertyFee, data_elevator, data_floor, data_jishi,
                data_jiting, data_jiwei, house_area, house_time_2,
                house_price, houseName, Ourl,
            ]
            for col, value in enumerate(row):
                sheet.write(n, col, value)
            n = n + 1
            # Saving after every row leaves a file on disk even if the run
            # is interrupted later.
            book.save('anjuke_sjz.xls')
    except Exception as err:
        # Exception rather than a bare except, so Ctrl-C still stops the run.
        print("不好!!安居客出现了验证码!!!!!!!!!!!!!!!")
        # Show the real error so network failures can be told from captchas.
        print(repr(err))
        time.sleep(30)  # back off for 30 seconds before the next listing
if __name__ == '__main__':
    # Script entry point: gather every listing link first, then scrape the
    # listings one at a time.
    url_start = "https://sjz.anjuke.com/sale/kfqsjz/"
    allurl = getAllurl(url_start)
    print('共获取' + str(len(allurl)))
    print('所有链接提取完成!')
    for k, listing_url in enumerate(allurl, start=1):
        getMsg(listing_url)
        print('正在写入第' + str(k) + '条数据')
        # Pause for a random 0.8-1.5 s between listings; requesting pages too
        # quickly can get the client frozen by the site.
        time.sleep(random.uniform(0.8, 1.5))
不足:以上代码还未能解决动态验证码,如图所示。