vlambda博客
学习文章列表

静态网页爬虫-安居客

python+request+xpath 12小时1w3k条数据 动态验证码

难度:

"""
@version: python3.6
@author: 'achai'
@software: PyCharm
@file: demo_5.py
@time: 2020/12/26 22:03
"""
import urllib.request
from lxml import etree
import xlwt
import re
import time
import random

# Output workbook: one row per scraped listing. Saved incrementally inside
# getMsg() as 'anjuke_sjz.xls' so a crash keeps the rows collected so far.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('housePrice')

# Header row: column titles matching the fields written by getMsg(), in order:
# longitude, latitude, decoration, plot ratio, greening rate, property fee,
# elevator, floor level, rooms, halls, baths, area, build year, price,
# community name, listing URL.
_COLUMN_TITLES = (
    '经度', '纬度', '毛坯或者精装', '容积率', '绿化率', '物业费', '电梯',
    '低层或者高层', '室', '厅', '卫', '面积', '建造年', '房价',
    '小区名称', '小区链接',
)
for _col, _title in enumerate(_COLUMN_TITLES):
    sheet.write(0, _col, _title)

# Next row index to write; advanced by getMsg() after each successful row.
n = 1

# HTTP headers sent with every request (browser UA + session cookie).
# BUG FIX: the original literal was missing the ':' separators
# ("User-Agent""Mozilla/..."), so Python parsed it as a *set* of two
# concatenated strings instead of a dict — unusable as request headers.
page_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    "Cookie": "aQQ_ajkguid=A789826A-7A5C-311C-565E-05CDD916C93C; id58=e87rkF/E7Q5+DzArBOW3Ag==; 58tj_uuid=a6d317a6-9b64-4a76-b92b-5af34c96bde3; als=0; wmda_new_uuid=1; wmda_uuid=a1e5f2b84628e0abf6f31c71bfa07655; wmda_visited_projects=%3B6289197098934; isp=true; ctid=28; _ga=GA1.2.646930384.1606826267; sessid=B47733BE-C6A8-8FEE-E0D8-FB445F012E3B; twe=2; wmda_session_id_6289197098934=1608990327081-f825170b-4ec7-d5d4; obtain_by=1; new_session=1; init_refer=; new_uv=3; _gid=GA1.2.1955540.1608990327; _gat=1; xxzl_cid=d869ee07e4404df6b994853a3eba70ee; xzuid=8b5f7a17-6587-4c55-bcce-02b0664b781f",
}

def getAllurl(baseUrl):
    """Collect the detail-page URL of every listing on index pages 1..50.

    baseUrl -- listing index URL ending in '/', e.g.
               'https://sjz.anjuke.com/sale/kfqsjz/'; page p is fetched as
               baseUrl + 'p<p>/' (like .../sale/yuhuaf/p2/).
    Returns a list of listing detail URLs in page order.
    """
    listall = []
    # Anjuke pages are 1-based (p1/, p2/, ...). The original looped
    # range(0, 50) and bumped the index by hand; range(1, 51) is equivalent.
    for page in range(1, 51):
        fullUrl = baseUrl + 'p' + str(page) + '/'

        # Fetch the index page with browser-like headers.
        req = urllib.request.Request(fullUrl, headers=page_headers)
        # BUG FIX: decode('utf-8''ignore') concatenated into the bogus codec
        # name 'utf-8ignore' (LookupError); 'ignore' is the errors argument.
        data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
        html = etree.HTML(data)  # parse the page into an element tree
        # Each listing card's title anchor carries the detail-page URL.
        house_url = html.xpath('//div[@class="house-details"]/div[@class="house-title"]/a/@href')
        listall.extend(house_url)
        print('该页所有链接提取完成!', page)
    return listall

def getMsg(Ourl):
    """Scrape one listing detail page and append its fields as a sheet row.

    Ourl -- absolute URL of a listing detail page.
    Side effects: writes row ``n`` of the module-level ``sheet``, increments
    ``n`` and saves the workbook. On any failure (network error, missing
    fields, or the site serving a captcha page) the row is skipped, a warning
    is printed and the process sleeps 30 s before returning.
    """
    try:
        print(Ourl)
        req = urllib.request.Request(Ourl, headers=page_headers)
        # BUG FIX: decode('utf-8''ignore') concatenated into the invalid
        # codec name 'utf-8ignore'; 'ignore' is the errors argument.
        data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
        html_1 = etree.HTML(data)  # parse the page into an element tree

        # Coordinates: the 3rd ssr <meta> tag's content embeds the two
        # numbers (longitude, latitude) — pull them out with a number regex.
        coord = html_1.xpath('//meta[@data-n-head="ssr"][3]/@content')
        coord_1 = ",".join(str(x) for x in coord)
        data_coord = re.findall(r"\d+\.?\d*", coord_1)
        lan = data_coord[0]
        lat = data_coord[1]

        # Decoration state (bare shell vs. renovated).
        data_decoration = html_1.xpath(
            '//*[@id="__layout"]/div/div[2]/div[2]/div[2]/div[1]/div[3]/div[2]/div[2]/text()')

        # Plot ratio.
        data_plotRatio = html_1.xpath('//*[@id="community"]/div[1]/div[2]/div[2]/p/text()')

        # Greening rate — keep only the numeric part (e.g. '35%').
        greeningRate = html_1.xpath('//*[@id="community"]/div[1]/div[3]/div[1]/p[2]/text()')
        data_greeningRate = re.findall(r"\d+\.?\d*", ",".join(greeningRate))

        # Property fee (e.g. '1.2 元/平米/月') — numeric part only.
        propertyFee = html_1.xpath('//*[@id="community"]/div[1]/div[2]/div[1]/p[2]/text()')
        data_propertyFee = re.findall(r"\d+\.?\d*", ",".join(propertyFee))

        # Elevator availability.
        data_elevator = html_1.xpath('//*[@id="__layout"]/div/div[2]/div[2]/div[2]/div[1]/div[4]/div[1]/span/text()')

        # Floor level (low / middle / high floor).
        data_floor = html_1.xpath('//*[@id="__layout"]/div/div[2]/div[2]/div[2]/div[1]/div[3]/div[1]/div[2]/text()')

        # Layout: number of rooms (室), halls (厅), bathrooms (卫).
        data_jishi = html_1.xpath(
            '//*[@id="__layout"]/div/div[2]/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/i[1]/text()')
        data_jiting = html_1.xpath(
            '//*[@id="__layout"]/div/div[2]/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/i[2]/text()')
        data_jiwei = html_1.xpath(
            '//*[@id="__layout"]/div/div[2]/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/i[3]/text()')

        # Floor area.
        house_area = html_1.xpath('//*[@id="__layout"]/div/div[2]/div[2]/div[2]/div[1]/div[3]/div[2]/div[1]/i/text()')

        # Build year — numeric part only.
        house_time = html_1.xpath('//*[@id="__layout"]/div/div[2]/div[2]/div[2]/div[1]/div[3]/div[3]/div[2]/text()')
        house_time_2 = re.findall(r"\d+\.?\d*", ",".join(house_time))

        # Listed price.
        house_price = html_1.xpath('//*[@id="__layout"]/div/div[2]/div[2]/div[2]/div[1]/div[2]/div[1]/span[1]/text()')

        # Community name — keep only the CJK characters of the anchor text.
        name_1 = " , ".join(html_1.xpath(
            '//*[@id="__layout"]/div/div[2]/div[2]/div[2]/div[1]/div[4]/div[2]/div[1]/a[1]/text()'))
        houseName = ''.join(re.findall('[\u4e00-\u9fa5]', name_1))

        # Captcha detection: when Anjuke serves a verification page the area
        # xpath matches nothing, so an empty house_area means "skip this row".
        if len(house_area):
            global n
            # BUG FIX: the xpath/findall results are Python lists; xlwt's
            # Row.write only accepts strings/numbers/bool/datetime/None, so
            # join each list into a single comma-separated string.
            row = (lan, lat,
                   ",".join(data_decoration), ",".join(data_plotRatio),
                   ",".join(data_greeningRate), ",".join(data_propertyFee),
                   ",".join(data_elevator), ",".join(data_floor),
                   ",".join(data_jishi), ",".join(data_jiting),
                   ",".join(data_jiwei), ",".join(house_area),
                   ",".join(house_time_2), ",".join(house_price),
                   houseName, Ourl)
            for col, value in enumerate(row):
                sheet.write(n, col, value)
            n = n + 1
            # Save after every row so progress survives an interruption.
            book.save('anjuke_sjz.xls')

    except Exception:
        # Most common cause is the dynamic captcha page (fields missing /
        # indexing fails). Back off before the caller continues.
        print("不好!!安居客出现了验证码!!!!!!!!!!!!!!!")
        time.sleep(30)  # resume after 30 seconds

if __name__ == '__main__':
    # Step 1: gather every listing URL from the 50 index pages.
    url_start = "https://sjz.anjuke.com/sale/kfqsjz/"
    allurl = getAllurl(url_start)
    print('共获取' + str(len(allurl)))
    print('所有链接提取完成!')
    # Step 2: scrape each detail page, pausing between requests so the
    # site does not freeze the session for hammering it too fast.
    for row_no, link in enumerate(allurl, start=1):
        getMsg(link)
        print('正在写入第' + str(row_no) + '条数据')
        # Random 0.8–1.5 s delay between requests.
        time.sleep(random.uniform(0.8, 1.5))


不足:以上代码还未能解决动态验证码,如图所示。