A Case Study in Site-Wide News Data Collection with the Scrapy Crawler Framework
不知名菜鸟
01 run script: launches the crawl
# -*- coding: utf-8 -*-
from scrapy.cmdline import execute


# Equivalent to typing "scrapy crawl dianxin" on the command line.
execute(['scrapy', 'crawl', 'dianxin'])
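If you prefer not to go through scrapy.cmdline, the same crawl can be started in-process with Scrapy's CrawlerProcess. A minimal sketch, assuming it is run from the project root so that the project settings can be located:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load settings.py of the current Scrapy project and run the "dianxin" spider by name.
process = CrawlerProcess(get_project_settings())
process.crawl('dianxin')
process.start()  # blocks until the crawl finishes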
02 Spider file: core crawling code
import scrapy
from dianxin_news.items import DianxinNewsItem
import requests
import re


class DianxinSpider(scrapy.Spider):
    name = 'dianxin'
    # allowed_domains = ['xxx.com']
    start_urls = ['http://eie.hyit.edu.cn/']  # a news site; the same analysis applies to other sites
    model_urls = []

    def parse(self, response):
        wanted = [0, 2, 3, 4, 5, 6, 7]  # indexes of the sections (boards) to crawl
        lis = response.xpath('/html/body/div[4]/div/ul/li')
        for index, li in enumerate(lis):
            if index in wanted:
                model_url = ['http://eie.hyit.edu.cn/' + j for j in li.xpath('./div[1]/a/@href').extract()]
                self.model_urls.append(model_url)
        self.model_urls = [url for urls in self.model_urls for url in urls]  # flatten the nested list
        for url in self.model_urls:
            base_url = re.findall(r'(.*?)\.htm', url)[0]  # base part of the section URL, used for pagination
            yield scrapy.Request(url=url, callback=self.parse_model, meta={'url': base_url})

    def parse_model(self, response):
        base_url = response.meta.get('url')
        # collect the URLs of every news article on the current page
        lis = response.xpath('//*[@id="conmains"]/div/div[3]/ul/li')
        for li in lis:
            item = DianxinNewsItem()  # a fresh item per article, so concurrent requests don't overwrite each other
            url = 'http://eie.hyit.edu.cn/info/' + li.xpath('./div[2]/a/@href').extract_first().split('info/')[-1]
            item['url'] = url
            yield scrapy.Request(url=url, callback=self.parse_detail, meta={'item': item})

        # follow the "next page" link of this section
        try:
            next_page = response.css('.Next::attr(href)').extract_first()
        except Exception:
            pass
        else:
            if next_page is not None:
                page = re.findall(r'(\d+)\.htm', next_page)[0]
                next_url = base_url + '/' + page + '.htm'
                yield scrapy.Request(url=next_url, callback=self.parse_model, meta={'url': base_url})

    def parse_detail(self, response):
        item = response.meta.get('item')
        title = response.xpath('//*[@id="conmains"]/div/div[2]/form/h2/text()').extract_first()
        author = response.xpath('//*[@id="conmains"]/div/div[2]/form/h3/span[1]/text()').extract_first().split(':')[-1]
        item['author'] = author
        time = response.xpath('//*[@id="conmains"]/div/div[2]/form/h3/span[2]/text()').extract_first().split(':')[-1]
        item['time'] = time
        # the click counter is filled in by an inline script; its arguments identify the article
        click_webnews = eval(
            response.xpath('//*[@id="conmains"]/div/div[2]/form/h3/span[3]/script/text()').extract_first().split(
                'Clicks')[-1])
        click_nums_url = f'http://eie.hyit.edu.cn/system/resource/code/news/click/dynclicks.jsp?clickid={click_webnews[-1]}&owner={click_webnews[-2]}&clicktype={click_webnews[0]}'
        click = requests.get(click_nums_url).json()  # the endpoint returns the click count
        item['click'] = click

        item['title'] = title

        yield item
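The detail parser above recovers the click-counter arguments by eval-ing the tail of an inline script. If you would rather not eval page content, the same values can be pulled out with a regular expression. A minimal sketch, assuming the script text has the usual form _showDynClicks("wbnews", <owner>, <clickid>) implied by the split on 'Clicks':

import re

def parse_click_args(script_text):
    """Extract (clicktype, owner, clickid) from the inline click-counter script, or None if it doesn't match."""
    match = re.search(r'Clicks\("([^"]+)",\s*(\d+),\s*(\d+)\)', script_text)
    if match is None:
        return None
    return match.group(1), match.group(2), match.group(3)

# Hypothetical example values:
# parse_click_args('_showDynClicks("wbnews", 1234567890, 8888)')
# -> ('wbnews', '1234567890', '8888')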
03 items class: defines the data structure of the results
import scrapy


class DianxinNewsItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    time = scrapy.Field()
    click = scrapy.Field()
    url = scrapy.Field()
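A scrapy.Item behaves much like a dict, which is why the spider and the pipeline below can index it with item['time'], item['click'], and so on. A small usage sketch with made-up field values:

item = DianxinNewsItem()
item['title'] = 'Example headline'  # hypothetical value
item['click'] = 123                 # hypothetical value

print(item.get('title'))  # 'Example headline'
print(dict(item))         # a plain dict with the fields set so far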
04 Pipelines: clean, validate, and store the data
import pymysql


class DianxinNewsPipeline:
    def open_spider(self, spider):
        """Open the MySQL connection when the spider starts."""
        self._conn = pymysql.connect(user='root', password='123456', host='127.0.0.1', port=3306,
                                     database='scrapy_infos', autocommit=True)

    def process_item(self, item, spider):
        """Pipeline-based persistence: write each item into MySQL."""
        cursor = self._conn.cursor()
        sql = 'insert into hyitdx(release_time, click_nums, title, author, url) values(%s,%s,%s,%s,%s)'
        cursor.execute(sql, (item['time'], item['click'], item['title'], item['author'], item['url']))
        cursor.close()
        print(item['title'], 'saved!')

        return item

    def close_spider(self, spider):
        """Close the MySQL connection when the spider finishes."""
        self._conn.close()
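The pipeline assumes a database named scrapy_infos containing a table hyitdx whose columns match the INSERT statement. The article does not show the schema, so the column types below are assumptions; only the column names come from the INSERT. A one-off setup sketch using pymysql:

import pymysql

# One-off setup: create the database and table the pipeline writes to.
# Column types (and the surrogate id key) are assumptions, not taken from the article.
conn = pymysql.connect(user='root', password='123456', host='127.0.0.1', port=3306, autocommit=True)
with conn.cursor() as cursor:
    cursor.execute('CREATE DATABASE IF NOT EXISTS scrapy_infos CHARACTER SET utf8mb4')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS scrapy_infos.hyitdx (
            id INT AUTO_INCREMENT PRIMARY KEY,
            release_time VARCHAR(64),
            click_nums INT,
            title VARCHAR(255),
            author VARCHAR(64),
            url VARCHAR(255)
        ) CHARACTER SET utf8mb4
    ''')
conn.close()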
05 settings file: project-wide configuration parameters
BOT_NAME = 'dianxin_news'

SPIDER_MODULES = ['dianxin_news.spiders']
NEWSPIDER_MODULE = 'dianxin_news.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dianxin_news (+http://www.yourdomain.com)'


# Obey robots.txt rules
ROBOTSTXT_OBEY = False
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
LOG_LEVEL = 'ERROR'
RETRY_ENABLED = False
DOWNLOAD_TIMEOUT = 10
COOKIES_ENABLED = False
TELNETCONSOLE_ENABLED = False
CONCURRENT_REQUESTS = 300
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'dianxin_news.pipelines.DianxinNewsPipeline': 300,
}
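CONCURRENT_REQUESTS = 300 is aggressive for a single small site. If the target server struggles, Scrapy's built-in throttling settings can be layered on top of the configuration above; a sketch with values that are only suggestions:

# Optional throttling, if the target site needs a gentler crawl.
CONCURRENT_REQUESTS = 16               # Scrapy's default
DOWNLOAD_DELAY = 0.5                   # seconds between requests to the same site
AUTOTHROTTLE_ENABLED = True            # adapt the delay to the server's response times
AUTOTHROTTLE_TARGET_CONCURRENCY = 4.0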
06 Project results
End