
Hands-On: A Lianjia Scraper Framework

租赁青年读书会 2018-10-30

The drifting duckweed has no roots of its own; ask not after the wanderer at the world's end.


A note up front: after a ten-year absence, I'm back!

                                                    --君莫问

 

A complete crawler framework consists of a scheduler, a URL manager, a downloader, a parser, and an outputer.

 

Alright, I'll be lazy and just borrow the flow chart from CSDN ~ don't laugh.

 

The scheduler is the program's entry point and coordinates data flow between the other modules.

The URL manager keeps track of all URLs used by the framework.

The downloader requests pages from the web and fetches their contents.

The parser processes the data fetched by the downloader, extracting new URLs and the data to be output; the new URLs are handed back to the URL manager via the scheduler.

The outputer writes the data to a local file.



Scheduler:

 

class spider_main(object):

    def __init__(self):
        self.urlmanager_obj = UrlManager()
        self.parser_obj = HtmlParser()
        self.downloader_obj = HtmlDownloader()
        self.outputer_obj = HtmlOutputer()

    def craw(self, url):
        # download the root_url and extract the districts and their URLs
        root_url_obj = self.downloader_obj.download(url)
        area_urls = self.parser_obj.parse_area_url(root_url_obj)
        # download each district URL and extract the towns and their URLs
        for area_item in area_urls:
            #print("Crawling listings for district {}".format(area_item))
            area_obj = self.downloader_obj.download(area_urls[area_item])
            town_urls = self.parser_obj.parse_town_url(area_obj)
            # download each town URL and collect the listing links
            for town_item in town_urls:
                print("Collecting listing links for {}·{}".format(area_item, town_item))
                for i in range(1, 10000):
                    if i == 1:
                        town_url = town_urls[town_item]
                        #print(town_url)
                    else:
                        town_url = town_urls[town_item] + "pg" + str(i) + "/"
                    print(town_url)
                    town_obj = self.downloader_obj.download(town_url)
                    house_urls = self.parser_obj.parse_house_urls(town_obj)
                    if len(house_urls) == 0:
                        break
                    self.urlmanager_obj.add_new_urls(house_urls)

                print("Finished collecting listing links for {}·{}".format(area_item, town_item))
                print("Start crawling listing data for this area")
                # crawl the data of each listing
                while self.urlmanager_obj.has_url():
                    try:
                        print("URLs remaining:", len(self.urlmanager_obj.new_urls))
                        house_url = self.urlmanager_obj.get_new_url()
                        house_obj = self.downloader_obj.download(house_url)
                        house_data = self.parser_obj.parse_house_data(house_obj, house_url)
                        self.outputer_obj.output(house_data)
                    except Exception as e:
                        print(e, house_url)
                print("Finished crawling listing data for this area")

# main function
def main():
    root_url = "https://sh.lianjia.com/ershoufang/"
    crawer = spider_main()
    crawer.craw(root_url)

# entry point
if __name__ == "__main__":
    main()
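A note on the pagination loop: Lianjia listing pages are addressed by appending pgN/ to the town URL (page 1 has no suffix), and the loop stops as soon as a page yields no listing links. The same idea can be expressed as a generator instead of a hard-coded range(1, 10000); the helper below is hypothetical and not part of the original framework:

from itertools import count

def town_page_urls(town_url):
    # yield page 1 (no suffix), then pg2/, pg3/, ... until the caller stops iterating
    yield town_url
    for i in count(2):
        yield town_url + "pg" + str(i) + "/"

# usage sketch: iterate until a page returns no listings
# for page_url in town_page_urls("https://sh.lianjia.com/ershoufang/pudong/"):
#     ... download, parse, break when empty ...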

 

 

URL manager:

 

class UrlManager(object):

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def has_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def add_new_url(self, url):
        # only queue URLs that are neither pending nor already crawled
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls:
            self.add_new_url(url)
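A quick usage sketch of the manager's interface (the listing URLs below are made up purely for illustration):

manager = UrlManager()
manager.add_new_urls({
    "https://sh.lianjia.com/ershoufang/101.html",  # made-up IDs
    "https://sh.lianjia.com/ershoufang/102.html",
})
while manager.has_url():
    # each URL is handed out once and then remembered in old_urls,
    # so adding it again later has no effect
    print(manager.get_new_url())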

 

Downloader:

 

import urllib.request

class HtmlDownloader(object):

    def download(self, url):
        try:
            response = urllib.request.urlopen(url)
            return response.read()
        except Exception as e:
            print(e, url)
            return ""

 

Parser:

 

import re
from bs4 import BeautifulSoup

class HtmlParser(object):

    def parse_house_urls(self, html):
        urls = set()
        soup = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
        links = soup.find_all("a", {"href": re.compile(r"https://sh\.lianjia\.com/ershoufang/\d+\.html"), "class": "title"})
        for link in links:
            urls.add(link["href"])
        return urls

    def parse_house_data(self, html, house_url):
        data = list()
        soup = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
        # unique listing ID
        class_id = soup.find("div", {"class": "houseRecord"}).find("span", {"class": "info"})
        data.append(class_id.get_text()[:-2])
        # total price, in units of 10,000 RMB
        price = soup.find("span", {"class": "total"})
        data.append(price.get_text())
        # floor area, year built, etc.
        house_area = soup.find("div", {"class": "overview"}).find("div", {"class": "area"})
        data.append(house_area.get_text())
        # residential community
        community = soup.find("div", {"class": "communityName"}).find("a", {"class": "info"})
        data.append(community.get_text())
        addr = soup.find("div", {"class": "areaName"})
        data.append(addr.get_text().replace("\xa0", ""))
        # listing URL
        data.append(house_url)
        # detailed attributes
        elements = soup.find("div", {"class": "newwrapbaseinform", "id": "introduction"}).find_all("li")
        for element in elements:
            data.append(element.get_text().replace("\xa0", ""))
        # other information (e.g. selling points)
        others = soup.find("div", {"class": "introContentshowbasemore"}).find_all("div", {"class": "baseattributeclear"})
        for other in others:
            data.append(other.get_text().replace("\xa0", ""))
        return data

    def parse_area_url(self, html):
        urls = dict()
        url = "https://sh.lianjia.com"
        soup = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
        links = soup.find_all(
            "a", {
                "title": re.compile(r"上海[\u4E00-\u9FA5]{2,4}在售二手房"),
                "href": re.compile(r"/ershoufang/[\w]+/")
            })
        for link in links:
            urls[link.get_text()] = url + link["href"]
        return urls

    def parse_town_url(self, html):
        urls = dict()
        url = "https://sh.lianjia.com"
        soup = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
        links = soup.find("div", {"data-role": "ershoufang"}).find_all("div")
        for link in links[1].find_all("a"):
            urls[link.get_text()] = url + link["href"]
        return urls
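To see what parse_area_url's matching rules pick up, here is a self-contained sketch run against a made-up HTML fragment shaped like Lianjia's district filter bar (the fragment is invented for illustration; the real page markup may differ):

import re
from bs4 import BeautifulSoup

sample = '<a href="/ershoufang/pudong/" title="上海浦东在售二手房">浦东</a>'
soup = BeautifulSoup(sample, "html.parser")
link = soup.find("a", {
    "title": re.compile(r"上海[\u4E00-\u9FA5]{2,4}在售二手房"),
    "href": re.compile(r"/ershoufang/[\w]+/")
})
print(link.get_text(), "https://sh.lianjia.com" + link["href"])
# prints: 浦东 https://sh.lianjia.com/ershoufang/pudong/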

 

 

Outputer:

 

import csv

class HtmlOutputer(object):

    # the encoding must be gb18030; some characters cannot be represented in gbk
    def output(self, data):
        with open("链家_data.csv", "a", newline="", encoding="gb18030") as fp:
            csv_obj = csv.writer(fp, dialect="excel")
            csv_obj.writerow(data)
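Usage is simply outputer.output(row) with the list returned by parse_house_data. When reading the file back, the encoding has to match what was written; a small sketch using the standard csv module (rows can vary in length, since parse_house_data appends a variable number of fields):

import csv

with open("链家_data.csv", "r", newline="", encoding="gb18030") as fp:
    rows = list(csv.reader(fp))
print(len(rows), "rows read")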

 

Here's a picture of the result. The listing data for all of Shanghai comes to an estimated 50 MB, so storing it in a database is recommended; otherwise...


This framework can crawl all second-hand housing listings for Shanghai, Beijing, Shenzhen, and other cities. Since it does not use multithreading, the overall crawl is rather slow and remains to be optimized.
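One possible optimization is to parallelize the per-listing downloads with a thread pool once the listing links have been collected. A rough sketch, assuming downloader, parser and outputer are instances of the classes above; note that HtmlOutputer appends to a single CSV file, so heavy concurrent use would also need a threading.Lock around the write:

from concurrent.futures import ThreadPoolExecutor

downloader = HtmlDownloader()   # classes defined above
parser = HtmlParser()
outputer = HtmlOutputer()

def crawl_house(house_url):
    # hypothetical worker: fetch, parse and store a single listing
    try:
        html = downloader.download(house_url)
        outputer.output(parser.parse_house_data(html, house_url))
    except Exception as e:
        print(e, house_url)

def crawl_all(house_urls, workers=8):
    # house_urls would be the set collected via parse_house_urls
    with ThreadPoolExecutor(max_workers=workers) as pool:
        list(pool.map(crawl_house, house_urls))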

