├── Python3网络爬虫中小型项目实战集中营 ├── 01_python爬取电影天堂 │ ├── dytt.py │ └── 电影天堂.csv ├── 02_python爬取斗罗大陆小说 │ ├── dldl.py │ ├── 斗破苍穹小说.csv │ ├── 斗破苍穹小说.py │ └── 斗罗大陆小说.csv ├── 03_python爬取欧洲足球联赛数据 │ └── footballData.py ├── 04_python爬取豆瓣电影Top250 │ ├── douban_top250_movies.csv │ └── filmTop250.py ├── 05_python爬取股票数据 │ └── stockInfo.py ├── 06_python爬取人人贷网数据 │ └── peopleLoad.py ├── 07_python爬取创业邦创投库 │ ├── python爬取创业邦创投库.py │ └── resultsDatas.csv ├── 08_python抓取美团网百万商家信息 │ ├── meituan.csv │ └── python抓取美团网百万商家信息.py ├── 09_python爬取网易云音乐评论并把他们存入mysql数据库 │ └── python爬取网易云音乐评论并把他们存入mysql数据库.py ├── 10_python爬取“网上购物”类APP │ ├── apps.csv │ ├── python爬取网上购物类APP数据py │ └── 网上购物类APP数据分析并展示.py ├── 11_python爬取链家网房价信息 │ ├── Lianjia_Info_v1.py │ ├── Lianjia_Info_v2.py │ ├── Lianjia_Info_v3.py │ ├── Lianjia_Info_v4.py │ ├── Lianjia_Info_v4_analysis.py │ ├── lianjia.csv │ ├── lianjia_ershou_futian_100.xlsx │ └── lianjia_re_v4.csv ├── 12_python爬取并分析豆瓣中最新电影的影评(词云显示) │ ├── alice_mask.png │ ├── alice_mask1.png │ ├── python爬取并分析豆瓣中最新电影的影评.py │ ├── show_Chinese.png │ ├── stopwords.txt │ └── 豆瓣影评爬取入库.py ├── 13_python爬取豆瓣书籍信息 │ ├── books.csv │ └── python爬取豆瓣书籍信息.py ├── 14_python爬取今日头条信息并导入mongodb数据库 │ └── python爬取今日头条信息并导入mongodb数据库.py ├── 15_python使用selenium爬取百度招聘内容并存入mongodb数据库 │ └── python使用selenium爬取百度招聘内容并入mongodb数据库.py ├── 16_python爬取熊猫直播用户信息 │ └── python爬取熊猫直播用户信息.py ├── 17_scrapy爬取游天下南京短租房信息并存入mongodb数据库 │ └── youtxNanJin │ │ ├── README.txt │ │ ├── scrapy.cfg │ │ ├── youtxNanJin │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── youtxNanJin_spider.cpython-36.pyc │ │ │ └── youtxNanJin_spider.py │ │ ├── 游天下南京.csv │ │ └── 游天下南京.json ├── 18_scrapy爬取中国医学人才网信息并以json格式保存 │ └── chinadoctornet │ │ ├── README.txt │ │ ├── chinadoctornet │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── chinadoctornet_spider.cpython-36.pyc │ │ │ └── chinadoctornet_spider.py │ │ ├── scrapy.cfg │ │ ├── 中国医学人才网招聘最新招聘专栏.csv │ │ └── 中国医学人才网招聘最新招聘专栏.json ├── 19_scrapy框架爬取豆瓣电影top250信息 │ └── doubanmovie │ │ ├── README.txt │ │ ├── doubanmovie │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── doubanmovie_spider.cpython-36.pyc │ │ │ └── doubanmovie_spider.py │ │ ├── items.csv │ │ ├── items.json │ │ └── scrapy.cfg ├── 20_scrapy爬取织梦者网站信息并存入mongodb数据库 │ └── makedream │ │ ├── makedream │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── makedream_spider.cpython-36.pyc │ │ │ └── 
makedream_spider.py │ │ └── scrapy.cfg ├── 21_python爬取豆瓣电影前任3评论(词云显示) │ ├── ComentsAnaylst.py │ ├── ciyun.jpg │ ├── ciyun.png │ ├── douban.txt │ └── douban_qianren3.py ├── 22_python爬取Bilibili用户信息并导入mysql数据库 │ ├── bilibili_user.py │ ├── bilibili_user_info.sql │ └── user_agents.txt ├── 23_python爬取网易云音乐所有歌曲的评论数 │ ├── README.md │ ├── album_by_artist.py │ ├── artists.py │ ├── comments_by_music.py │ ├── music_by_album.py │ └── sql.py ├── 24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库 │ └── findtrip │ │ ├── ctrip_items.csv │ │ ├── findtrip │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── ctrip_spider.cpython-36.pyc │ │ │ ├── qua_spider.cpython-36.pyc │ │ │ └── washctrip.cpython-36.pyc │ │ │ ├── ctrip_spider.py │ │ │ ├── qua_spider.py │ │ │ └── washctrip.py │ │ ├── qua_items.csv │ │ ├── qua_items.json │ │ └── scrapy.cfg ├── 25_scrapy爬取前程无忧网站python相关的工作信息 │ └── pythonjobs │ │ ├── PythonJobs.csv │ │ ├── pythonjobs │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── job_spider.cpython-36.pyc │ │ │ └── job_spider.py │ │ └── scrapy.cfg ├── 26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库 │ └── shuimujob │ │ ├── ghostdriver.log │ │ ├── scrapy.cfg │ │ └── shuimujob │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ ├── platform.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── platform.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── shuimu_spider.cpython-36.pyc │ │ └── shuimu_spider.py └── 27_scrapy爬取南京20000多套二手房信息 │ └── nj_house │ ├── house.csv │ ├── nj_house │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── lj_house.cpython-36.pyc │ │ └── lj_house.py │ └── scrapy.cfg ├── Python3网络爬虫快速入门篇 ├── README.md ├── biqukan.py └── 一念永恒.txt └── README.md /Python3网络爬虫中小型项目实战集中营/01_python爬取电影天堂/dytt.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取电影天堂最新电影迅雷下载地址链接信息 3 | 所用模块:requests bs4 pandas数据分析 4 | ''' 5 | import requests 6 | import re 7 | import pandas as pd 8 | 9 | url = 'https://www.dy2018.com/html/gndy/dyzz/index.html' 10 | 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 13 | } 14 | 15 | items_list = [] 16 | 17 | html = requests.get(url,headers=headers) 18 | html.encoding = 'gb2312' 19 | data = re.findall('.*?',html_1.text) 27 | #print(data_1[0]) 28 | list_1 = [i[1], url_1, data_1[0]] 29 | 30 | # list_1 = [url_1] 31 | 32 | items_list.append(list_1) 33 | #print (list_1) 34 | 35 | #print 
('==========================================================================================================') 36 | 37 | for m in range(2, 298): 38 | url_2 = 'https://www.dy2018.com/html/gndy/dyzz/index_'+str(m)+'.html' 39 | print(url_2) 40 | html_2 = requests.get(url_2,headers=headers) 41 | html_2.encoding = 'gb2312' 42 | data_2 = re.findall('.*?',html_3.text) 50 | #print(data_3[0]) 51 | if len(data_3) < 1: 52 | continue 53 | list_2 = [n[1], url_3, data_3[0]] 54 | # list_2 = [url_3] 55 | 56 | 57 | items_list.append(list_2) 58 | #print (list_2) 59 | #print ('=====================================================================================================') 60 | 61 | df = pd.DataFrame(items_list, columns = ['电影名称','电影网址链接','电影迅雷下载链接']) 62 | 63 | df.to_csv('dytt.csv') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/01_python爬取电影天堂/电影天堂.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/01_python爬取电影天堂/电影天堂.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/dldl.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取斗罗大陆最新章节标题信息 3 | 所用模块:requests re bs4 pandas数据分析 4 | ''' 5 | import requests 6 | import re 7 | import pandas as pd 8 | from bs4 import BeautifulSoup #分析网页 获取标签内容 9 | 10 | url = 'https://www.freexs.org/novel/0/896/index.html' 11 | 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 14 | } 15 | 16 | items_list = [] 17 | 18 | html = requests.get(url,headers=headers) 19 | html.encoding = 'gb2312' 20 | 21 | data = re.findall('
(.*?)
',html.text) 22 | for i in data: 23 | url_1 = 'https://www.freexs.org/novel/0/896/'+str(i[0]) 24 | print (i[1]) 25 | print (url_1) 26 | list = [url_1, i[1]] 27 | items_list.append(list) 28 | 29 | 30 | # html_1 = requests.get(url_1,headers=headers) 31 | # html_1.encoding = 'gb2312' 32 | # soup = BeautifulSoup(html_1.text,'lxml') 33 | # title = soup.find('div', class_='readout').text #标题 34 | # print (title) 35 | 36 | df = pd.DataFrame(items_list, columns = ['链接','章节主题']) 37 | df.to_csv('斗罗大陆小说.csv') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/斗破苍穹小说.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/斗破苍穹小说.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/斗破苍穹小说.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | ''' 4 | 今日主题:python抓取斗破苍穹最新章节标题信息 5 | 所用模块:requests re bs4 pandas数据分析 6 | ''' 7 | import requests 8 | import re 9 | import pandas as pd 10 | from bs4 import BeautifulSoup #分析网页 获取标签内容 11 | 12 | url = 'https://www.miaobige.com/read/68/' 13 | 14 | headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'} 15 | 16 | items_list = [] 17 | 18 | html = requests.get(url, headers=headers) 19 | # html.encoding = 'gb2312' 20 | soup = BeautifulSoup(html.text,'html.parser') 21 | title = soup.find('div', id='readerlists') 22 | datas = re.findall('
  • (.*?)
  • ',title.text) 23 | for data in datas: 24 | url_1 = 'https://www.miaobige.com/' + data[0] 25 | print (data) 26 | item_list = [url_1, data[1]] 27 | items_list.append(item_list) 28 | 29 | df = pd.DataFrame(items_list, columns = ['链接','章节主题']) 30 | df.to_csv('斗破苍穹小说.csv') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/斗罗大陆小说.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/斗罗大陆小说.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/03_python爬取欧洲足球联赛数据/footballData.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | 4 | import requests 5 | import urlparse 6 | import bs4 7 | import csv 8 | 9 | BASE_URL = "http://soccerdata.sports.qq.com" 10 | PLAYER_LIST_QUERY = "/playerSearch.aspx?lega=%s&pn=%d" 11 | league = ['epl','seri','bund','liga','fran','scot','holl','belg'] 12 | page_number_limit = 100 13 | player_fields = ['league_cn','img','name_cn','name','team','age','position_cn','nation','birth','query','id','teamid','league'] 14 | 15 | def get_players(baseurl): 16 | html = requests.get(baseurl).text 17 | soup = bs4.BeautifulSoup(html, "lxml") 18 | players = [ dd for dd in soup.select('.searchResult tr') if dd.contents[1].name != 'th'] 19 | result = [] 20 | for player in players: 21 | record = [] 22 | link = '' 23 | query = [] 24 | for item in player.contents: 25 | if type(item) is bs4.element.Tag: 26 | if not item.string and item.img: 27 | record.append(item.img['src']) 28 | else : 29 | record.append(item.string and item.string.strip() or 'na') 30 | try: 31 | o = urlparse.urlparse(item.a['href']).query 32 | if len(link) == 0: 33 | link = o 34 | query = dict([(k,v[0]) for k,v in urlparse.parse_qs(o).items()]) 35 | except: 36 | pass 37 | 38 | if len(record) != 10: 39 | for i in range(0, 10 - len(record)): 40 | record.append('na') 41 | record.append(unicode(link,'utf-8')) 42 | record.append(unicode(query["id"],'utf-8')) 43 | record.append(unicode(query["teamid"],'utf-8')) 44 | record.append(unicode(query["lega"],'utf-8')) 45 | result.append(record) 46 | return result 47 | 48 | result = [] 49 | for url in [ BASE_URL + PLAYER_LIST_QUERY % (l,n) for l in league for n in range(page_number_limit) ]: 50 | result = result + get_players(url) 51 | 52 | 53 | for i in league: 54 | for j in range(0, 100): 55 | url = BASE_URL + PLAYER_LIST_QUERY % (l,n) 56 | ## send request to url and do scraping 57 | 58 | 59 | def write_csv(filename, content, header = None): 60 | file = open(filename, "wb") 61 | file.write('\xEF\xBB\xBF') 62 | writer = csv.writer(file, delimiter=',') 63 | if header: 64 | writer.writerow(header) 65 | for row in content: 66 | encoderow = [dd.encode('utf8') for dd in row] 67 | writer.writerow(encoderow) 68 | 69 | write_csv('players.csv',result,player_fields) 70 | 71 | def get_player_match(url): 72 | html = requests.get(url).text 73 | soup = bs4.BeautifulSoup(html, "lxml") 74 | matches = [ dd for dd in soup.select('.shtdm tr') if dd.contents[1].name != 'th'] 75 | records = [] 76 | for item in [ dd for dd in matches if len(dd.contents) > 11]: ## filter out the personal part 77 | record = [] 78 | for match in [ dd for dd in item.contents if type(dd) is bs4.element.Tag]: 79 | if match.string: 80 | 
record.append(match.string) 81 | else: 82 | for d in [ dd for dd in match.contents if type(dd) is bs4.element.Tag]: 83 | query = dict([(k,v[0]) for k,v in urlparse.parse_qs(d['href']).items()]) 84 | record.append('teamid' in query and query['teamid'] or query['id']) 85 | record.append(d.string and d.string or 'na') 86 | records.append(record) 87 | return records[1:] ##remove the first record as the header 88 | 89 | def get_players_match(playerlist, baseurl = BASE_URL + '/player.aspx?'): 90 | result = [] 91 | for item in playerlist: 92 | url = baseurl + item[10] 93 | print (url) 94 | result = result + get_player_match(url) 95 | return result 96 | match_fields = ['date_cn','homeid','homename_cn','matchid','score','awayid','awayname_cn','league_cn','firstteam','playtime','goal','assist','shoot','run','corner','offside','foul','violation','yellowcard','redcard','save'] 97 | write_csv('m.csv',get_players_match(result),match_fields) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/04_python爬取豆瓣电影Top250/douban_top250_movies.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/04_python爬取豆瓣电影Top250/douban_top250_movies.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/04_python爬取豆瓣电影Top250/filmTop250.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- encoding:utf-8 -*- 3 | 4 | """ 5 | @author : Tom 6 | @file : douban_movie 7 | @time : 2018/4/6 23:04 8 | @description : 9 | 10 | """ 11 | 12 | import requests 13 | import re 14 | from bs4 import BeautifulSoup 15 | import csv 16 | 17 | 18 | # 先创建一个csv文件,写好头部 19 | with open("douban_top250_movies.csv", 'w') as filed: # a+为添加,w为擦除重写 20 | csv_writer = csv.DictWriter(filed, [ 21 | u'片名', 22 | u'评分', 23 | u'评分人数', 24 | u'一句话描述', 25 | u'豆瓣链接', 26 | ]) 27 | csv_writer.writeheader() 28 | 29 | 30 | def get_mov_info(response): 31 | mov_info = {} 32 | soup = BeautifulSoup(response.text, "lxml") 33 | movies = soup.find_all('div', class_="info") 34 | 35 | for info in movies: 36 | # 获得电影的中文名 37 | mov_info['mov_name'] = info.find('span', class_='title').text # find()只找到一个,结果以树结构返回 38 | 39 | # 获得电影在豆瓣中的链接 40 | mov_info['mov_link'] = info.find('a').get('href') 41 | 42 | # 找到评分以及评价人数 43 | rating_num = info.find(class_='rating_num') 44 | mov_info['rating_score'] = rating_num.text 45 | comment = rating_num.find_next_sibling().find_next_sibling() 46 | # 对评价字段切分 47 | comment_num = re.findall('\d{0,}', comment.text) 48 | mov_info['comment_nums'] = comment_num[0] # 正则匹配re中没有find(),findall()以列表形式返回结果 49 | 50 | # 获得一句话评价 51 | comment_one = info.find('span', class_='inq') 52 | if comment_one is None: 53 | mov_info['inq_comment'] = u' ' 54 | else: 55 | mov_info['inq_comment'] = comment_one.text 56 | print (mov_info) 57 | 58 | # 一条条存入csv文件 59 | write_csv(mov_info) 60 | 61 | 62 | def write_csv(info_dict): 63 | with open("douban_top250_movies.csv", 'a+') as f: 64 | csv_write = csv.DictWriter(f, [ 65 | u'片名', 66 | u'评分', 67 | u'评分人数', 68 | u'一句话描述', 69 | u'豆瓣链接', 70 | ]) 71 | csv_write.writerow({ # writerow()写入单行,writerows写入多行,这里只有一行数据,用writerows报错 72 | u'片名': info_dict['mov_name'], 73 | u'评分': info_dict['rating_score'], 74 | u'评分人数': info_dict['comment_nums'], 75 | u'一句话描述': info_dict['inq_comment'], 76 | u'豆瓣链接': info_dict['mov_link'] 77 | 
}) 78 | 79 | for num in range(0, 10): 80 | page = num * 25 81 | response = requests.get("https://movie.douban.com/top250?start=%d&filter=" % page) 82 | get_mov_info(response) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/05_python爬取股票数据/stockInfo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import traceback 6 | import re 7 | 8 | def getHTMLText(url): 9 | try: 10 | r = requests.get(url) 11 | r.raise_for_status() 12 | r.encoding = r.apparent_encoding 13 | return r.text 14 | except: 15 | return "" 16 | 17 | def getStockList(lst, stockURL): 18 | html = getHTMLText(stockURL) 19 | soup = BeautifulSoup(html, 'html.parser') 20 | a = soup.find_all('a') 21 | for i in a: 22 | try: 23 | href = i.attrs['href'] 24 | lst.append(re.findall(r"[s][hz]\d{6}", href)[0]) 25 | except: 26 | continue 27 | 28 | def getStockInfo(lst, stockURL, fpath): 29 | count = 0 30 | for stock in lst: 31 | url = stockURL + stock + ".html" 32 | html = getHTMLText(url) 33 | try: 34 | if html=="": 35 | continue 36 | infoDict = {} 37 | soup = BeautifulSoup(html, 'html.parser') 38 | stockInfo = soup.find('div',attrs={'class':'stock-bets'}) 39 | 40 | name = stockInfo.find_all(attrs={'class':'bets-name'})[0] 41 | infoDict.update({'股票名称': name.text.split()[0]}) 42 | 43 | keyList = stockInfo.find_all('dt') 44 | valueList = stockInfo.find_all('dd') 45 | for i in range(len(keyList)): 46 | key = keyList[i].text 47 | val = valueList[i].text 48 | infoDict[key] = val 49 | 50 | with open(fpath, 'a', encoding='utf-8') as f: 51 | f.write( str(infoDict) + '\n' ) 52 | count = count + 1 53 | print("\r当前进度: {:.2f}%".format(count*100/len(lst)),end="") 54 | except: 55 | count = count + 1 56 | print("\r当前进度: {:.2f}%".format(count*100/len(lst)),end="") 57 | continue 58 | 59 | def main(): 60 | stock_list_url = 'http://quote.eastmoney.com/stocklist.html' 61 | stock_info_url = 'https://gupiao.baidu.com/stock/' 62 | output_file = 'BaiduStockInfo.csv' 63 | slist=[] 64 | getStockList(slist, stock_list_url) 65 | getStockInfo(slist, stock_info_url, output_file) 66 | 67 | main() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/06_python爬取人人贷网数据/peopleLoad.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import numpy as np 4 | import requests 5 | import time 6 | import random 7 | from bs4 import BeautifulSoup 8 | 9 | s=requests.session() 10 | 11 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'} 12 | #根据浏览器下自行修改 13 | 14 | headers['Cookie'] = 'gr_user_id=022d0f46-4981-4224-9895-18bfe32d9276; rrdLoginCartoon=rrdLoginCartoon; pgv_pvi=905847926; Hm_lvt_16f9bb97b83369e62ee1386631124bb1=1474288518,1474332677,1474336816,1474368269; Hm_lpvt_16f9bb97b83369e62ee1386631124bb1=1474372985; JSESSIONID=7EB90C9967D8C42B08DFB18EB9A9F74ED2ACC468B7D56B9372E2A20684713847; jforumUserInfo=bEAY23pgyLLLjII69w9oS%2BtK2jljmxa8%0A; IS_MOBLIE_IDPASS=true-false; activeTimestamp=5195275; gr_session_id_9199126ed94d770d=70bbe285-4ac6-42c9-a49b-9255d0eb9c46; gr_cs1_70bbe285-4ac6-42c9-a49b-9255d0eb9c46=user_id%3A5195275' 15 | #根据浏览器F12下的Request Headers->Cookie自行复制上去即可 16 | 17 | 18 | def parse_userinfo(loanid):#自定义解析借贷人信息的函数 19 | timestamp=str(int(time.time())) + '%03d' % random.randint(0,999)
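        # 说明:秒级时间戳(10位)后拼接3位随机数,得到形如前端 Date.now() 的13位时间戳字符串(推测用于模拟浏览器请求参数)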
20 | urll="http://www.we.com/lend/detailPage.action?loanId=%.0f&timestamp=" % loanid+timestamp
    #这个urll我也不知道怎么来的,貌似可以用urll="http://www.we.com/loan/%f" % loanid+timestamp
    #(就是页面本身,我也没试过) 21 | result = s.get(urll,headers=headers) 22 | html = BeautifulSoup(result.text,'lxml') 23 | info = html.find_all('table',class_="ui-table-basic-list") 24 | info1= info[0] 25 | info2 = info1.find_all('div',class_="basic-filed") 26 | userinfo = {} 27 | for item in info2: 28 | vartag = item.find('span') 29 | var = vartag.string 30 | if var == '信用评级': 31 | var = '信用评分' 32 | pf1 = repr(item.find('em')) 33 | value = re.findall(r'\d+',pf1) 34 | else: 35 | valuetag = item.find('em') 36 | value = valuetag.string 37 | userinfo[var]=value 38 | data = pd.DataFrame(userinfo) 39 | return data 40 | 41 | rrd=pd.read_csv('loanId.csv') #loanId是之前散标数据中的loanId,将其单独整理为一个csv文档 42 | loanId=rrd.ix[:,'loanId'] 43 | user_info = ['昵称', '信用评分', 44 | 45 | '年龄', '学历', '婚姻', 46 | 47 | '申请借款', '信用额度', '逾期金额', '成功借款', '借款总额', '逾期次数','还清笔数', '待还本息', '严重逾期', 48 | 49 | '收入', '房产', '房贷', '车产', '车贷', 50 | 51 | '公司行业', '公司规模', '岗位职位', '工作城市', '工作时间'] 52 | 53 | table = pd.DataFrame(np.array(user_info).reshape(1, 24), columns=user_info) 54 | 55 | i = 1 56 | 57 | for loanid in loanId: 58 | table = pd.concat([table, parse_userinfo(loanid)]) 59 | print(i) 60 | i += 1 #看一下循环多少次 61 | 62 | table.to_csv('userinfo.csv',header=False) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/07_python爬取创业邦创投库/python爬取创业邦创投库.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import time 5 | from bs4 import BeautifulSoup 6 | import pandas as pd 7 | # 导入pandas库 8 | 9 | # 设置列表页面URL的固定部分 10 | url = 'https://bj.lianjia.com/ershoufang/' 11 | BASE_URL_U1 = "http://www.cyzone.cn/event/list-764-0-" 12 | BASE_URL_U2 = "-0-0-0-0/" 13 | 14 | # 最好在http请求中设置一个头部信息,否则很容易被封ip 15 | headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 16 | 'Accept':'text/html;q=0.9,*/*;q=0.8', 17 | 'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 18 | 'Accept-Encoding':'gzip', 19 | 'Connection':'close', 20 | 'Referer':'http://www.baidu.com/link?url=_andhfsjjjKRgEWkj7i9cFmYYGsisrnm2A-TN3XZDQXxvGsM9k9ZZSnikW2Yds4s&amp;wd=&amp;eqid=c3435a7d00146bd600000003582bfd1f' 21 | } 22 | 23 | # 循环抓取列表页信息 24 | for i in range(1,31): # 分页 25 | if i == 1: 26 | i=str(1) 27 | var_url = (BASE_URL_U1 + i + BASE_URL_U2) 28 | r = requests.get(url=var_url, headers=headers) 29 | html = r.content 30 | #print(html) 31 | else: 32 | i=str(i) 33 | var_url=(BASE_URL_U1 + i + BASE_URL_U2) 34 | var_url=requests.get(url=var_url,headers=headers) 35 | html2=r.content 36 | html = html + html2 37 | # 每次间隔1秒 38 | time.sleep(1) 39 | 40 | # 解析抓取的页面内容 41 | res = BeautifulSoup(html, 'html.parser') 42 | # 获取感兴趣目标信息: 43 | # 提取公司名称 44 | # table>tbody>tr.table-plate3>td.tp2>span.tp2_tit>a 45 | companys = res.find_all('span', 'tp2_tit') 46 | cnames = [] 47 | print(len(companys)) 48 | for item in companys: 49 | cname = item.a.string 50 | cnames.append(cname) 51 | 52 | #print(cnames) 53 | # 获取感兴趣目标信息: 54 | # 提取公司详情url 55 | companys = res.find_all('span', 'tp2_tit') 56 | urls = [] 57 | for item in companys: 58 | url = item.a['href'] 59 | urls.append(url) 60 | # 获取感兴趣目标信息: 61 | # 提取当前融资轮次,行业,投资方和更新时间 62 | 63 | # res = BeautifulSoup(html, 'html5lib') 64 | # finances = res.select('div#main > div.list-table3 > table > tbody > tr') 65 | finances = res.find_all('tr', 'table-plate3') 66 | # 融资轮次,行业,投资方,更新时间 67 | financing_rounds, businesses, investors, update_times = [],[],[],[] 68 | 
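# 说明:下面的循环按倒数位置从每行的 td 中取值:items[-5] 融资轮次、items[-4] 行业、items[-3] 投资方、items[-2] 更新时间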
#print(len(finances)) 69 | for i in range(0, len(finances)): 70 | # 获取第一行数据(范围) 71 | items = finances[i].find_all('td') 72 | # print(items) 73 | # 获取融资轮次 74 | fround = items[-5].text.strip() 75 | #获取行业 76 | business = items[-4].text.strip() 77 | #获取投资方 78 | investor = items[-3].text.strip() 79 | #获取更新时间 80 | update_time = items[-2].text.strip() 81 | financing_rounds.append(fround) 82 | businesses.append(business) 83 | investors.append(investor) 84 | update_times.append(update_time) 85 | 86 | # 将获取的数据进行汇总: 87 | #print(len(cnames)) 88 | #print(len(urls)) 89 | #print(len(financing_rounds)) 90 | #print(len(financing_rounds)) 91 | #print(len(businesses)) 92 | #print(len(investors)) 93 | #print(len(update_times)) 94 | 95 | # 创建数据表 96 | resultsDatas = pd.DataFrame({'公司名称':cnames,'详情URL':urls,'融资轮次':financing_rounds,'行业':businesses,'投资方':investors,'更新时间':update_times}) 97 | # 查看数据表内容 98 | print(resultsDatas) 99 | 100 | 101 | resultsDatas.to_csv("resultsDatas.csv") -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/07_python爬取创业邦创投库/resultsDatas.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/07_python爬取创业邦创投库/resultsDatas.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/08_python抓取美团网百万商家信息/meituan.csv: -------------------------------------------------------------------------------- 1 | ,title,score,address,phone,Evaluation_number 2 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/08_python抓取美团网百万商家信息/python抓取美团网百万商家信息.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取美团网百万商家信息 3 | 所用模块:requests bs4 数据分析 4 | 流程分析:1、获取主页源码 5 | 2、获取二级菜单链接(美食、电影。。。) 6 | 3、商品店家信息 7 | ''' 8 | import requests 9 | from bs4 import BeautifulSoup #分析网页 获取标签内容 10 | import json 11 | import lxml 12 | import pandas as pd 13 | 14 | url = 'http://chs.meituan.com/' 15 | 16 | headers = { 17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 18 | } 19 | 20 | #获取分类(电影、美食) 21 | def get_start_links(url, headers=None): 22 | html = requests.get(url,headers=headers).text #发送请求获取主页文本 23 | soup = BeautifulSoup(html, 'lxml') #解析网页 24 | links = [link.find('div').find('div').find('dl').find('dt').find('a')['href'] for link in soup.find_all('div', class_='J-nav-item')] 25 | return links 26 | 27 | #获取分类链接中的店铺id 28 | def get_detail_id(url, headers=None): 29 | html = requests.get(url,headers=headers).text 30 | soup = BeautifulSoup(html,'lxml') 31 | content_id = json.loads(soup.find('div', class_='J-scrollloader cf J-hub')['data-async-params']) 32 | return json.loads(content_id.get('data')).get('poiidList') 33 | 34 | #获取店铺详情数据 35 | def get_item_info(url, headers=None): 36 | html = requests.get(url,headers=headers).text 37 | soup = BeautifulSoup(html,'lxml') 38 | title = soup.find('span', class_='title').text #标题 39 | score = soup.find('span', class_='biz-level').get_text() #评分 40 | address = soup.find('span', class_='geo').text #地址 41 | phone = soup.find_all('p', class_='under-title')[1].get_text() #电话 42 | Evaluation_number = soup.find('a', class_='num rate-count').text #评价 43 | print (u'店名: '+title) 44 | print (u'评论数量: '+Evaluation_number) 45 | print (u'地址: '+address) 46 | 
print (u'评分: '+score) 47 | print (u'电话: '+phone) 48 | print ('======================================================') 49 | return (title, score, address, phone, Evaluation_number) 50 | 51 | 52 | items_list = [] 53 | 54 | start_url_list = get_start_links(url) 55 | for j in start_url_list:#分类链接 56 | for i in range(1,11): #多页 57 | category_url = j+'/all/page()'.format(i) #完整的分类多页链接 58 | shop_id_list = get_detail_id(category_url,headers=headers) 59 | print (shop_id_list) 60 | for shop_id in shop_id_list: 61 | items = get_item_info(url+'shop/{}'.format(shop_id),headers) 62 | items_list.append(items) 63 | 64 | df = pd.DataFrame(items_list, columns = ['title','score','address','phone','Evaluation_number']) 65 | df.to_csv('meituan.csv') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/09_python爬取网易云音乐评论并把他们存入mysql数据库/python爬取网易云音乐评论并把他们存入mysql数据库.py: -------------------------------------------------------------------------------- 1 | # 爬取网易云音乐评论并把他们存入mysql数据库 2 | import requests,json,os 3 | import base64 4 | import codecs 5 | from Crypto.Cipher import AES 6 | import pymysql 7 | 8 | 9 | class Spider(): 10 | 11 | def __init__(self): 12 | 13 | self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0', 14 | 'Referer': 'http://music.163.com/'} 15 | self.url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_531051217?csrf_token=' 16 | 17 | 18 | def __get_jsons(self,url,page): 19 | # 获取两个参数 20 | music = WangYiYun() 21 | text = music.create_random_16() 22 | params = music.get_params(text,page) 23 | encSecKey = music.get_encSEcKey(text) 24 | fromdata = {'params' : params,'encSecKey' : encSecKey} 25 | jsons = requests.post(url, data=fromdata, headers=self.header) 26 | #print(jsons.raise_for_status()) 27 | # 打印返回来的内容,是个json格式的 28 | #print(jsons.content) 29 | return jsons.text 30 | 31 | def json2list(self,jsons): 32 | '''把json转成字典,并把他重要的信息获取出来存入列表''' 33 | # 可以用json.loads()把他转成字典 34 | #print(json.loads(jsons.text)) 35 | users = json.loads(jsons) 36 | comments = [] 37 | for user in users['comments']: 38 | # print(user['user']['nickname']+' : '+user['content']+' 点赞数:'+str(user['likedCount'])) 39 | name = user['user']['nickname'] 40 | content = user['content'] 41 | # 点赞数 42 | likedCount = user['likedCount'] 43 | user_dict = {'name': name, 'content': content, 'likedCount': likedCount} 44 | comments.append(user_dict) 45 | return comments 46 | 47 | def write2sql(self,comments): 48 | '''把评论写入数据库''' 49 | music = Operate_SQL() 50 | print('第%d页正在获取' % self.page) 51 | for comment in comments: 52 | #print(comment) 53 | music.add_data(comment) 54 | print(' 该页获取完成') 55 | 56 | 57 | 58 | def run(self): 59 | self.page = 1 60 | while True: 61 | jsons = self.__get_jsons(self.url,self.page) 62 | comments = self.json2list(jsons) 63 | print(comments[0]) 64 | # 当这一页的评论数少于20条时,证明已经获取完 65 | self.write2sql(comments) 66 | if len(comments) < 20: 67 | print('评论已经获取完') 68 | break 69 | self.page +=1 70 | 71 | # 找出post的两个参数params和encSecKey 72 | class WangYiYun(): 73 | 74 | def __init__(self): 75 | # 在网易云获取的三个参数 76 | 77 | self.second_param = '010001' 78 | self.third_param = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7' 79 | self.fourth_param = '0CoJUm6Qyw8W8jud' 80 | 81 
| def create_random_16(self): 82 | '''获取随机十六个字母拼接成的字符串''' 83 | return (''.join(map(lambda xx: (hex(ord(xx))[2:]), str(os.urandom(16)))))[0:16] 84 | 85 | def aesEncrypt(self, text, key): 86 | # 偏移量 87 | iv = '0102030405060708' 88 | # 文本 89 | pad = 16 - len(text) % 16 90 | text = text + pad * chr(pad) 91 | encryptor = AES.new(key, 2, iv) 92 | ciphertext = encryptor.encrypt(text) 93 | ciphertext = base64.b64encode(ciphertext) 94 | return ciphertext 95 | 96 | def get_params(self,text,page): 97 | '''获取网易云第一个参数''' 98 | # 第一个参数 99 | if page == 1: 100 | self.first_param = '{rid: "R_SO_4_400162138", offset: "0", total: "true", limit: "20", csrf_token: ""}' 101 | else: 102 | self.first_param = ('{rid: "R_SO_4_400162138", offset:%s, total: "false", limit: "20", csrf_token: ""}'%str((page-1)*20)) 103 | 104 | params = self.aesEncrypt(self.first_param, self.fourth_param).decode('utf-8') 105 | params = self.aesEncrypt(params, text) 106 | return params 107 | 108 | def rsaEncrypt(self, pubKey, text, modulus): 109 | '''进行rsa加密''' 110 | text = text[::-1] 111 | rs = int(codecs.encode(text.encode('utf-8'), 'hex_codec'), 16) ** int(pubKey, 16) % int(modulus, 16) 112 | return format(rs, 'x').zfill(256) 113 | 114 | def get_encSEcKey(self,text): 115 | '''获取第二个参数''' 116 | pubKey = self.second_param 117 | moudulus = self.third_param 118 | encSecKey = self.rsaEncrypt(pubKey, text, moudulus) 119 | return encSecKey 120 | 121 | # 操作 mysql 122 | class Operate_SQL(): 123 | # 连接数据库 124 | def __get_conn(self): 125 | try: 126 | # 我用的的本地数据库,所以host是127.0.0.1 127 | self.conn = pymysql.connect(host='127.0.0.1',user='root',passwd='19980129.jie',port=3306,db='music',charset='utf8mb4') 128 | except Exception as e: 129 | print(e, '数据库连接失败') 130 | 131 | def __close_conn(self): 132 | '''关闭数据库连接''' 133 | try: 134 | if self.conn: 135 | self.conn.close() 136 | except pymysql.Error as e: 137 | print(e, '关闭数据库失败') 138 | 139 | def add_data(self,comment): 140 | '''增加一条数据到数据库''' 141 | sql = 'INSERT INTO `comments`(`name`,`content`,`likedCount`) VALUE(%s,%s,%s)' 142 | try: 143 | self.__get_conn() 144 | cursor = self.conn.cursor() 145 | cursor.execute(sql, (comment['name'],comment['content'],comment['likedCount'])) 146 | self.conn.commit() 147 | return 1 148 | except AttributeError as e: 149 | print(e,'添加数据失败') 150 | # 添加失败就倒回数据 151 | self.conn.rollback() 152 | return 0 153 | except pymysql.DataError as e: 154 | print(e) 155 | self.conn.rollback() 156 | return 0 157 | finally: 158 | if cursor: 159 | cursor.close() 160 | self.__close_conn() 161 | 162 | 163 | 164 | def main(): 165 | spider = Spider() 166 | spider.run() 167 | 168 | if __name__ == '__main__': 169 | main() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/10_python爬取“网上购物”类APP/apps.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/10_python爬取“网上购物”类APP/apps.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/10_python爬取“网上购物”类APP/python爬取网上购物类APP数据py: -------------------------------------------------------------------------------- 1 | # =========== Python3.X Jupyter =========== 2 | # =========== 步骤一、抓取每一个子分类的URL =========== 3 | 4 | # 导入第三方包 5 | import requests 6 | from bs4 import BeautifulSoup 7 | import numpy as np 8 | import time 9 | import pandas as pd 10 | 11 | # 设置请求头 12 | headers = 
{'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'} 13 | 14 | # 豌豆荚应用首页 > 安卓软件分类 > 网上购物 > 商城下载的链接 15 | url = 'http://www.wandoujia.com/category/5017_591' 16 | # 发送请求 17 | res = requests.get(url, headers = headers).text 18 | # 解析HTML 19 | soup = BeautifulSoup(res, 'html.parser') 20 | 21 | # 商城类app的5个分类链接及名称 22 | category_urls = [i.findAll('a')[0]['href'] for i in soup.findAll('ul',{'class':'switch-tab cate-tab'})[0].findAll('li')[1:]] 23 | category_names = [i.text.strip() for i in soup.findAll('ul',{'class':'switch-tab cate-tab'})[0].findAll('li')[1:]] 24 | 25 | 26 | 27 | # =========== 步骤二、生成所有子分类及页码的URL =========== 28 | 29 | # 各类别app的前10页urls 30 | names = [] 31 | urls = [] 32 | 33 | for url,name in zip(category_urls,category_names): 34 | for i in range(1,11): 35 | names.append(name) 36 | urls.append(url+'/'+str(i)) 37 | 38 | 39 | 40 | 41 | # =========== 步骤三、抓取子分类页下各APP对应的URL =========== 42 | 43 | # 根据每一页的url抓出app对应的链接 44 | app_urls = [] 45 | 46 | for url in urls: 47 | res = requests.get(url, headers = headers).text 48 | soup = BeautifulSoup(res,'html.parser') 49 | 50 | # 返回每个页面中app的名称及对应的链接 51 | # 为防止报错,这里做了异常处理 52 | try: 53 | app_lists = soup.findAll('ul',{'id':'j-tag-list'})[0] 54 | app_urls.extend([i.findAll('a')[0]['href'] for i in app_lists.findAll('h2',{'class':'app-title-h2'})]) 55 | except: 56 | pass 57 | 58 | 59 | 60 | 61 | # =========== 步骤四、爬虫抓取各APP的详细信息 =========== 62 | 63 | # 构建空的列表,用于数据的存储 64 | appname = [] 65 | appcategory = [] 66 | install = [] 67 | love = [] 68 | comments = [] 69 | size = [] 70 | update = [] 71 | version = [] 72 | platform = [] 73 | company = [] 74 | 75 | for url in app_urls: 76 | res = requests.get(url, headers = headers).text 77 | soup = BeautifulSoup(res,'html.parser') 78 | 79 | try: 80 | # 抓取的信息 81 | appname.append(soup.find('p',{'class':'app-name'}).text.strip()) 82 | appcategory.append('-'.join(soup.find('dl',{'class':'infos-list'}).findAll('dd')[1].text.strip().split('\n'))) 83 | install.append(soup.find('span',{'class':'item install'}).find('i').text) 84 | love.append(soup.find('span',{'class':'item love'}).find('i').text) 85 | comments.append(soup.find('div',{'class':'comment-area'}).find('i').text) 86 | size.append(soup.find('dl',{'class':'infos-list'}).findAll('dd')[0].text.strip()) 87 | update.append(soup.find('dl',{'class':'infos-list'}).findAll('dd')[3].text.strip()) 88 | version.append(soup.find('dl',{'class':'infos-list'}).findAll('dd')[4].text.strip()) 89 | platform.append(soup.find('dl',{'class':'infos-list'}).findAll('dd')[5].text.strip().split('\n')[0]) 90 | company.append(soup.find('dl',{'class':'infos-list'}).findAll('dd')[6].text.strip()) 91 | except: 92 | pass 93 | 94 | 95 | 96 | 97 | # =========== 步骤五、数据存储 =========== 98 | 99 | # 将存储的列表值写入到字典,并进行数据框的转换 100 | apps = pd.DataFrame({'appname':appname,'appcategory':appcategory, 101 | 'install':install,'love':love,'comments':comments,'size':size, 102 | 'update':update,'version':version,'platform':platform,'company':company}) 103 | 104 | # 数据导出 105 | apps.to_csv('apps.csv', index = False) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/10_python爬取“网上购物”类APP/网上购物类APP数据分析并展示.py: -------------------------------------------------------------------------------- 1 | # =========== Python3.X Jupyter =========== 2 | 3 | # 导入第三方包 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | # 读取外部数据源 9 | app_info = pd.read_csv('apps.csv') 10 | 
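# 兼容性提示(推测):下文使用的 Series.reshape(-1,1) 在较新版本的 pandas 中已移除,如运行报错可改为 app_info.comments.values.reshape(-1,1)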
11 | # 数据集的观测数量及变量数 12 | app_info.shape 13 | 14 | # 窥探数据前5行信息 15 | app_info.head() 16 | 17 | # 查看数据集各变量的类型 18 | app_info.dtypes 19 | 20 | # 检查数据是否有重复(一般对于爬虫数据都需要检查) 21 | any(app_info.duplicated()) 22 | 23 | # 数值变量的描述性分析 24 | app_info.describe() 25 | 26 | 27 | 28 | # 剔除重复观测 29 | app_info.drop_duplicates(inplace=True) 30 | app_info.shape 31 | 32 | # 删除评论人数为-1的观测(因为只有2条记录) 33 | app_info = app_info.loc[app_info.comments != -1,] 34 | 35 | # 离散变量的统计描述 36 | app_info.describe(include = ['object']) 37 | 38 | 39 | 40 | # 自定义函数,处理安装人数的单位 41 | def func(x): 42 | if x.find('亿') != -1: 43 | y = float(x[:-1])*10000 44 | elif x.find('万') != -1: 45 | y = float(x[:-1]) 46 | else: 47 | y = float(x)/10000 48 | return(y) 49 | # 安装人数变量的类型转换 50 | app_info['install_new'] = app_info.install.apply(func) 51 | 52 | # 自定义匿名函数 53 | y = lambda x : float(x[:-2]) if x.find('MB') != -1 else float(x[:-2])/1024 54 | # 软件大小变量的类型转换 55 | app_info['size_new'] = app_info['size'].apply(y) 56 | 57 | # 自定义匿名函数,将“暂无”设置为缺失值 58 | y = lambda x : np.nan if x == '暂无' else float(x[:-1])/100 59 | app_info['love_new'] = app_info['love'].apply(y) 60 | 61 | # 用中位数对好评率进行填补 62 | app_info['love_new'] = app_info.love_new.fillna(app_info.love_new.median()) 63 | 64 | # 日期类型的转换 65 | app_info['update_new'] = pd.to_datetime(app_info['update'], format = '%Y年%m月%d日') 66 | 67 | 68 | 69 | # 数值变量的描述性统计 70 | app_info.describe() 71 | 72 | # 删除不必要的变量 73 | app_info.drop(['install','size','love','update'], axis = 1, inplace=True) 74 | app_info.head() 75 | 76 | 77 | 78 | # 各类应用安装量最多的前5个APP(产生绘图数据) 79 | ls = [] 80 | 81 | categories = ['商城','团购','优惠','快递','全球导购'] 82 | for cate in categories: 83 | sub = app_info.loc[app_info.appcategory.apply(lambda x : x.find(cate) != -1),['appname','install_new']] 84 | 85 | # 取前5的安装量 86 | sub = sub.sort_values(by = ['install_new'],ascending=False)[:5] 87 | sub['type'] = cate 88 | ls.append(sub) 89 | # 合并数据集 90 | app_install_cat = pd.concat(ls) 91 | 92 | 93 | # 设置绘图风格 94 | plt.style.use('ggplot') 95 | # 中文处理 96 | plt.rcParams['font.sans-serif'] = 'Microsoft YaHei' 97 | 98 | # 为了让多张子图在一张图中完成,设置子图的位置 99 | ax1 = plt.subplot2grid((3,2),(0,0)) 100 | ax2 = plt.subplot2grid((3,2),(0,1)) 101 | ax3 = plt.subplot2grid((3,2),(1,0)) 102 | ax4 = plt.subplot2grid((3,2),(1,1)) 103 | ax5 = plt.subplot2grid((3,2),(2,0), colspan=2) # colspan指定跨过的列数 104 | 105 | # 将图框存放起来,用于循环使用 106 | axes = [ax1,ax2,ax3,ax4,ax5] 107 | types = app_install_cat.type.unique() 108 | 109 | # 循环的方式完成5张图的绘制 110 | for i in range(5): 111 | # 准备绘图数据 112 | data = app_install_cat.loc[app_install_cat.type == types[i]] 113 | # 绘制条形图 114 | axes[i].bar(range(5), data.install_new, color = 'steelblue', alpha = 0.7) 115 | # 设置图框大小 116 | gcf = plt.gcf() 117 | gcf.set_size_inches(8, 6) 118 | # 添加标题 119 | axes[i].set_title(types[i]+'类APP下载量前5的应用', size = 9) 120 | # 设置刻度位置 121 | axes[i].set_xticks(np.arange(5) + 0.4) 122 | # 为刻度添加标签值 123 | axes[i].set_xticklabels(data.appname, fontdict={'fontsize':7}, color = 'red') 124 | # 删除各子图上、右和下的边界刻度标记 125 | axes[i].tick_params(top = 'off', bottom = 'off', right = 'off') 126 | 127 | # 调整子图之间的水平间距和高度间距 128 | plt.subplots_adjust(hspace=0.6, wspace=0.3) 129 | 130 | # 显示图形 131 | plt.show() 132 | 133 | 134 | 135 | # 各类应用好评率最低的前5个APP(产生绘图数据) 136 | ls = [] 137 | categories = ['商城','团购','优惠','快递','全球导购'] 138 | for cate in categories: 139 | sub = app_info.loc[app_info.appcategory.apply(lambda x : x.find(cate) != -1),['appname','love_new']] 140 | # 取前5的安装量 141 | sub = sub.sort_values(by = ['love_new'])[:5] 142 | sub['type'] = cate 143 | ls.append(sub) 144 | 
app_love_cat = pd.concat(ls) 145 | 146 | # 为了让多张子图在一张图中完成,设置子图的位置 147 | ax1 = plt.subplot2grid((3,2),(0,0)) 148 | ax2 = plt.subplot2grid((3,2),(0,1)) 149 | ax3 = plt.subplot2grid((3,2),(1,0)) 150 | ax4 = plt.subplot2grid((3,2),(1,1)) 151 | ax5 = plt.subplot2grid((3,2),(2,0), colspan=2) # colspan指定跨过的列数 152 | 153 | # 将图框存放起来,用于循环使用 154 | axes = [ax1,ax2,ax3,ax4,ax5] 155 | types = app_love_cat.type.unique() 156 | 157 | # 循环的方式完成5张图的绘制 158 | for i in range(5): 159 | # 准备绘图数据 160 | data = app_love_cat.loc[app_love_cat.type == types[i]] 161 | # 绘制条形图 162 | axes[i].bar(range(5), data.love_new, color = 'steelblue', alpha = 0.7) 163 | # 设置图框大小 164 | gcf = plt.gcf() 165 | gcf.set_size_inches(8, 6) 166 | # 添加标题 167 | axes[i].set_title(types[i]+'类APP好评率后5的应用', size = 9) 168 | # 设置x轴刻度位置 169 | axes[i].set_xticks(np.arange(5) + 0.4) 170 | # 为x轴刻度添加标签值 171 | axes[i].set_xticklabels(data.appname, fontdict={'fontsize':7}, color = 'red') 172 | # 设置y轴刻度位置 173 | axes[i].set_yticks(np.arange(0,0.6,0.15)) 174 | # 为y轴刻度添加标签值 175 | axes[i].set_yticklabels([str(i*100) + '%' for i in np.arange(0,0.6,0.15)]) 176 | # 删除各子图上、右和下的边界刻度标记 177 | axes[i].tick_params(top = 'off', bottom = 'off', right = 'off') 178 | 179 | 180 | # 调整子图之间的水平间距和高度间距 181 | plt.subplots_adjust(hspace=0.6, wspace=0.3) 182 | # 显示图形 183 | plt.show() 184 | 185 | 186 | 187 | # 导入第三方模块 188 | from sklearn.linear_model import LinearRegression 189 | 190 | # 评价人数与好评率是否存在关系呢? 191 | # 散点图 192 | plt.scatter(app_info.comments, # 评价人数 193 | app_info.love_new, # 好评率 194 | s = 30, # 设置点的大小 195 | c = 'black', # 设置点的颜色 196 | marker = 'o', # 设置点的形状 197 | alpha = 0.9, # 设置点的透明度 198 | linewidths = 0.3, # 设置散点边界的粗细 199 | label = '观测点' 200 | ) 201 | 202 | # 建模 203 | reg = LinearRegression().fit(app_info.comments.reshape(-1,1), app_info.love_new) 204 | # 回归预测值 205 | pred = reg.predict(app_info.comments.reshape(-1,1)) 206 | 207 | # 绘制回归线 208 | plt.plot(app_info.comments, pred, linewidth = 2, label = '回归线') 209 | plt.legend(loc = 'lower right') 210 | 211 | # 添加轴标签和标题 212 | plt.title('评论人数与好评率的关系') 213 | plt.xlabel('评论人数') 214 | plt.ylabel('好评率') 215 | 216 | # 去除图边框的顶部刻度和右边刻度 217 | plt.tick_params(top = 'off', right = 'off') 218 | # 显示图形 219 | plt.show() 220 | 221 | 222 | 223 | # 评论人数的描述统计 224 | app_info.comments.describe(percentiles=np.arange(0,1.2,0.2)) 225 | 226 | #       有8成的APP,其评论人数不超过53人,数据太过偏态了。这里先筛选出评论人数不超过55人的app,然后,对其研究“评论人数”与“好评率”的关系。 227 | 228 | # 散点图 229 | sub_data = app_info.loc[app_info.comments <= 55,] 230 | # sub_data = app_info.loc[app_info.comments > 55,] 231 | plt.scatter(sub_data.comments, # 评价人数 232 | sub_data.love_new, # 好评率 233 | s = 30, # 设置点的大小 234 | c = 'black', # 设置点的颜色 235 | marker = 'o', # 设置点的形状 236 | alpha = 0.9, # 设置点的透明度 237 | linewidths = 0.3, # 设置散点边界的粗细 238 | label = '观测点' 239 | ) 240 | 241 | # 建模 242 | reg = LinearRegression().fit(sub_data.comments.reshape(-1,1), sub_data.love_new) 243 | # 回归预测值 244 | pred = reg.predict(sub_data.comments.reshape(-1,1)) 245 | 246 | # 绘制回归线 247 | plt.plot(sub_data.comments, pred, linewidth = 2, label = '回归线') 248 | plt.legend(loc = 'lower right') 249 | 250 | # 添加轴标签和标题 251 | plt.title('评论人数与好评率的关系') 252 | plt.xlabel('评论人数') 253 | plt.ylabel('好评率') 254 | 255 | # 显示图形 256 | plt.show() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/Lianjia_Info_v1.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from bs4 import BeautifulSoup 3 | import re 4 | import requests 5 | from parsel 
import Selector 6 | import pandas as pd 7 | import time 8 | ############################################################# 9 | ''''' 10 | 这个模块爬取链家网福田区的二手房信息;仅仅爬取了前100页的数据 11 | 为了避免反爬虫策略,设定每5秒钟抓取一页信息 12 | @time=2018-04-24 13 | @author=Tom 14 | 15 | ''' 16 | 17 | ########################################################### 18 | # 进行网络请求的浏览器头部 19 | headers={ 20 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36' 21 | 22 | } 23 | # pages是不同页码的网址列表 24 | pages=['https://sz.lianjia.com/ershoufang/futianqu/pg{}/'.format(x) for x in range(1,100)] 25 | ############################################################ 26 | 27 | ############################################################# 28 | lj_futian = pd.DataFrame(columns=['code','dec','img']) 29 | count=0 30 | def l_par_html(url): 31 | # 这个函数是用来获取链家网福田区二手房的信息 32 | wr=requests.get(url,headers=headers,stream=True) 33 | sel=Selector(wr.text) 34 | # describ用来获取房源的文字信息 35 | describ=sel.xpath('//li[@class="clear"]//text()').extract() 36 | new_information=([x for x in describ if x != '关注'and x != '加入对比' ]) 37 | sep_infor=' '.join(new_information).split(r'/平米')[:-1] 38 | # hou_code用来获取房源的编号 39 | hou_code=sel.xpath('//li[@class="clear"]/a/@data-housecode').extract() 40 | # hou_image用来获取房源的图片 41 | hou_image=sel.xpath('//li[@class="clear"]/a/img/@data-original').extract() 42 | # 将信息形成表格全部写到一起 43 | pages_info=pd.DataFrame(list(zip(hou_code,sep_infor,hou_image)),columns=['code','dec','img']) 44 | return pages_info 45 | 46 | for page in pages: 47 | a=l_par_html(page) 48 | count=count+1 49 | print ('the '+str(count)+' page is sucessful') 50 | time.sleep(5) 51 | lj_futian=pd.concat([lj_futian,a],ignore_index=True) 52 | 53 | # 将表格数据输出到excel文件 54 | lj_futian.to_excel('d:\\lianjia_ershou_futian_100.xlsx') 55 | 56 | 57 | #encoding:utf-8 58 | # import json # 使用json解码 因为拉勾网的格式是json 59 | # import requests # 使用这个requests是得到网页源码 60 | # import pandas as pd # 使用这个数据进行存储 61 | 62 | # items = [] # 定义空列表用来存放你得到的数据 63 | # # 循环两页 这里爬取的是两页内容 64 | # for i in range(1,2): 65 | # # 传入data 因为这个url是post的请求方法 pn指的是页数 kd指的是你搜索的内容 66 | # data = {'first': 'true', 'pn': i, 'kd': 'python'} 67 | # # 拉钩网的链接是固定的就变化的是页数 因为是post的提交方法 所以传入data 68 | # yuan = requests.post('https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false', data=data).text 69 | # # 使用json进行解码 因为返回的是一个json的格式 70 | # yuan = json.loads(yuan) 71 | # # 得到14个数据 72 | # for i in range(14): 73 | # item = [] 74 | # # 看下面的图片item里面的是什么数据 75 | # item.append(yuan['content']['positionResult']['result'][i]['positionName']) 76 | # item.append(yuan['content']['positionResult']['result'][i]['companyFullName']) 77 | # item.append(yuan['content']['positionResult']['result'][i]['salary']) 78 | # item.append(yuan['content']['positionResult']['result'][i]['city']) 79 | # item.append(yuan['content']['positionResult']['result'][i]['positionAdvantage']) 80 | # items.append(item) 81 | # # 使用的是pands的存数据 存为xlsx就是excel格式 82 | # data = pd.DataFrame(items) 83 | # data.to_excel('拉钩.xlsx') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/Lianjia_Info_v2.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #爬取链家二手房信息 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import csv 6 | 7 | def getHTMLText(url): 8 | try: 9 | r = requests.get(url,timeout=30) 10 | r.raise_for_status() 11 | r.encoding = r.apparent_encoding 12 | return 
r.text 13 | except: 14 | return '产生异常' 15 | 16 | def get_data(list,html): 17 | soup = BeautifulSoup(html,'html.parser') 18 | infos = soup.find('ul',{'class':'sellListContent'}).find_all('li') 19 | with open(r'lianjia.csv','a',encoding='utf-8') as f: 20 | for info in infos: 21 | name = info.find('div',{'class':'title'}).find('a').get_text() 22 | price =info.find('div',{'class':'priceInfo'}).find('div',{'class','totalPrice'}).find('span').get_text() 23 | f.write("{},{}\n".format(name,price)) 24 | 25 | def main(): 26 | start_url = 'https://sh.lianjia.com/ershoufang/pg' 27 | depth = 20 28 | info_list =[] 29 | for i in range(depth): 30 | url = start_url + str(i) 31 | html = getHTMLText(url) 32 | get_data(info_list,html) 33 | main() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/Lianjia_Info_v3.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import requests 3 | from requests.exceptions import RequestException 4 | from bs4 import BeautifulSoup 5 | from time import sleep 6 | import csv 7 | 8 | 9 | def write_to_file(content): 10 | with open('lianjia_bs4.csv', 'w') as csvfile: 11 | writer = csv.writer(csvfile) 12 | writer.writerows(content) 13 | csvfile.close() 14 | 15 | 16 | def get_one_page(url): 17 | try: 18 | response = requests.get(url) 19 | if response.status_code == 200: 20 | return response.text 21 | else: 22 | return None 23 | except RequestException: 24 | return None 25 | 26 | 27 | def parse_one_page(html): 28 | soup = BeautifulSoup(html, 'lxml') 29 | prefix = 'http://sh.lianjia.com' 30 | for item in soup.select('.info-panel'): 31 | houseUrl = prefix + item.find("h2").a["href"] 32 | title = item.find("h2").a["title"] 33 | spans = item.find(class_="where").find_all("span") 34 | xiaoqu, huxing, mianji = spans[0].string, spans[1].string.split('\xa0')[0], spans[2].string.split('\xa0')[0] 35 | cons = item.find(class_="con").find_all("a") 36 | area, sub_area = cons[0].string, cons[1].string 37 | subway = item.find(class_="fang-subway-ex").string 38 | price = item.find(class_="price").find(class_="num").string 39 | data = item.find(class_="price-pre").string.split('\n')[0] 40 | watched = item.find(class_="square").find(class_="num").string 41 | 42 | yield [houseUrl, title, xiaoqu, huxing, mianji, area, sub_area, subway, price, data, watched] 43 | 44 | 45 | if __name__ == '__main__': 46 | results = [] 47 | for page in range(100): 48 | sleep(1) 49 | print (page) 50 | url = 'http://sh.lianjia.com/zufang/d' + str(page) 51 | html = get_one_page(url) 52 | for item in parse_one_page(html): 53 | results.append(item) 54 | write_to_file(results) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/Lianjia_Info_v4.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import requests 3 | from requests.exceptions import RequestException 4 | from time import sleep 5 | import re 6 | import csv 7 | 8 | 9 | def write_to_file(content): 10 | with open('lianjia_re_v4.csv', 'w') as csvfile: 11 | writer = csv.writer(csvfile) 12 | writer.writerows(content) 13 | csvfile.close() 14 | 15 | 16 | def get_one_page(url): 17 | try: 18 | response = requests.get(url) 19 | if response.status_code == 200: 20 | return response.text 21 | else: 22 | return None 23 | except RequestException: 24 | return None 25 | 26 | 27 | def parse_one_page(html): 28 | pattern = 
re.compile('
    .*?"\shref="(.*?)".*?title="(.*?)".*?.*?(.*?)&nb.*?' + 29 | '(.*?)&nb.*?/">(.*?).*?/">(.*?).*?(.*?)<.*?-ex">(.*?)<' + 30 | '.*?-ex">(.*?).*?num">(\d+)<.*?-pre">(.*?)<.*?num">(\d+)<.*?', re.S) 31 | prefix = 'http://sh.lianjia.com' 32 | items = re.findall(pattern, html) 33 | for item in items: 34 | item = list(item) 35 | item[0] = prefix + item[0] 36 | item[6] = item[6].strip() 37 | item[10] = item[10].split('\n')[0] 38 | yield item 39 | 40 | 41 | def main(page, results): 42 | url = 'http://sh.lianjia.com/zufang/d' + str(page) 43 | html = get_one_page(url) 44 | for item in parse_one_page(html): 45 | results.append(item) 46 | 47 | 48 | if __name__ == '__main__': 49 | results = [] 50 | for i in range(100): 51 | sleep(1) 52 | print(i) 53 | main(i+1, results) 54 | write_to_file(results) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/Lianjia_Info_v4_analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import numpy as np 3 | import csv 4 | import re 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['font.sans-serif'] = ['SimHei'] 7 | 8 | 9 | if __name__ == "__main__": 10 | csv_reader = csv.reader(open('lianjia_re_v4.csv')) 11 | content = [] 12 | for row in csv_reader: 13 | content.append(row) 14 | 15 | all_region = [] 16 | regions = ['徐汇', '静安', '浦东', '杨浦', '闵行', '长宁', '宝山', '青浦', 17 | '金山', '普陀','松江', '嘉定', '闸北', '虹口', '奉贤', 18 | '崇明', '黄浦', '上海周边'] 19 | tmp = [] 20 | region_statistics = [] 21 | region_statistics_dict = {} 22 | for item in content: 23 | all_region.append(item[5]) 24 | 25 | for region in regions: 26 | if all_region.count(region): 27 | region_statistics.append(all_region.count(region)) 28 | region_statistics_dict[region] = all_region.count(region) 29 | tmp.append(region) 30 | regions = tmp 31 | 32 | fangzu = {} 33 | for region in regions: 34 | fangzu[region] = 0 35 | for item in content: 36 | fangzu[item[5]] += int(item[-3]) 37 | fangzu_average = [] 38 | for region in regions: 39 | fangzu_average.append(fangzu[region]/region_statistics_dict[region]) 40 | 41 | area = {} 42 | for region in regions: 43 | area[region] = 0 44 | for item in content: 45 | tmp = item[4] 46 | tmp = re.sub(r'[^\x00-\x7f]', '', tmp) 47 | area[item[5]] += int(tmp) 48 | area_average = [] 49 | for region in regions: 50 | area_average.append(area[region] / region_statistics_dict[region]) 51 | for i in range(len(area_average)): 52 | area_average[i] = fangzu_average[i]/area_average[i] 53 | 54 | # 地区分布 55 | a = [i for i in range(1, len(regions) + 1)] 56 | plt.bar(a, region_statistics, 0.4, color="blue") 57 | xlocations = np.array(range(1, len(regions) + 1)) 58 | plt.xticks(xlocations, regions, rotation=60) 59 | plt.ylabel("房屋数量") 60 | plt.xlabel("地区") 61 | plt.title("上海各区租房数量") 62 | for a, b in zip(a, region_statistics): 63 | plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=7) 64 | plt.savefig("上海各区租房数量.jpg", dpi=300) 65 | plt.close() 66 | 67 | # 各区房租情况 68 | a = [i for i in range(1, len(regions) + 1)] 69 | plt.bar(a, fangzu_average, 0.4, color="blue") 70 | xlocations = np.array(range(1, len(regions) + 1)) 71 | plt.xticks(xlocations, regions, rotation=60) 72 | plt.ylabel("月租 元/月") 73 | plt.xlabel("地区") 74 | plt.title("上海各区租房平均月租") 75 | for a, b in zip(a, fangzu_average): 76 | plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=7) 77 | plt.savefig("上海各区租房房租信息.jpg", dpi=300) 78 | plt.close() 79 | 80 | # 各区每平米平均月租 81 | a = [i for i in range(1, 
len(regions) + 1)] 82 | plt.bar(a, area_average, 0.4, color="blue") 83 | xlocations = np.array(range(1, len(regions) + 1)) 84 | plt.xticks(xlocations, regions, rotation=60) 85 | plt.ylabel("月租 元/月/平米") 86 | plt.xlabel("地区") 87 | plt.title("上海各区租房每平米平均月租") 88 | for a, b in zip(a, area_average): 89 | plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=7) 90 | plt.savefig("上海各区租房每平米房租信息.jpg", dpi=300) 91 | plt.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/lianjia_ershou_futian_100.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/lianjia_ershou_futian_100.xlsx -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/lianjia_re_v4.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/lianjia_re_v4.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/alice_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/alice_mask.png -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/alice_mask1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/alice_mask1.png -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/python爬取并分析豆瓣中最新电影的影评.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import warnings 4 | warnings.filterwarnings("ignore") 5 | import jieba # 分词包 6 | import numpy # numpy计算包 7 | import codecs # codecs提供的open方法来指定打开的文件的语言编码,它会在读取的时候自动转换为内部unicode 8 | import re 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | from PIL import Image 12 | from urllib import request 13 | from bs4 import BeautifulSoup as bs 14 | from wordcloud import WordCloud,ImageColorGenerator # 词云包 15 | import matplotlib 16 | matplotlib.rcParams['figure.figsize'] = (10.0, 5.0) 17 | 18 | 19 | 20 | # 分析网页函数 21 | def getNowPlayingMovie_list(): 22 | resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/') 23 | html_data = resp.read().decode('utf-8') 24 | soup = bs(html_data, 'html.parser') 25 | nowplaying_movie = soup.find_all('div', id='nowplaying') 26 | nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item') 27 | nowplaying_list = [] 28 | for item in nowplaying_movie_list: 29 | nowplaying_dict = {} 30 | nowplaying_dict['id'] = item['data-subject'] 31 | for tag_img_item in item.find_all('img'): 32 | nowplaying_dict['name'] = tag_img_item['alt'] 33 | nowplaying_list.append(nowplaying_dict) 34 | return nowplaying_list 35 | 36 | # 
爬取评论函数 37 | def getCommentsById(movieId, pageNum): 38 | eachCommentList = [] 39 | if pageNum > 0: 40 | start = (pageNum - 1) * 20 41 | else: 42 | return False 43 | requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20' 44 | print(requrl) 45 | resp = request.urlopen(requrl) 46 | html_data = resp.read().decode('utf-8') 47 | soup = bs(html_data, 'html.parser') 48 | comment_div_lits = soup.find_all('div', class_='comment') 49 | for item in comment_div_lits: 50 | if item.find_all('p')[0].string is not None: 51 | eachCommentList.append(item.find_all('p')[0].string) 52 | return eachCommentList 53 | 54 | def main(): 55 | # 循环获取第一个电影的前10页评论 56 | commentList = [] 57 | NowPlayingMovie_list = getNowPlayingMovie_list() 58 | for i in range(10): 59 | num = i + 1 60 | commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num) 61 | commentList.append(commentList_temp) 62 | 63 | # 将列表中的数据转换为字符串 64 | comments = '' 65 | for k in range(len(commentList)): 66 | comments = comments + (str(commentList[k])).strip() 67 | 68 | # 使用正则表达式去除标点符号 69 | pattern = re.compile(r'[\u4e00-\u9fa5]+') 70 | filterdata = re.findall(pattern, comments) 71 | cleaned_comments = ''.join(filterdata) 72 | 73 | # 使用结巴分词进行中文分词 74 | segment = jieba.lcut(cleaned_comments) 75 | words_df = pd.DataFrame({'segment': segment}) 76 | 77 | # 去掉停用词 78 | stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], 79 | encoding='utf-8') # quoting=3全不引用 80 | words_df = words_df[~words_df.segment.isin(stopwords.stopword)] 81 | 82 | # 统计词频 83 | words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size}) 84 | words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False) 85 | # print(words_stat.head()) 86 | 87 | bg_pic = numpy.array(Image.open("alice_mask.png")) 88 | 89 | # 用词云进行显示 90 | wordcloud = WordCloud( 91 | font_path="simhei.ttf", 92 | background_color="white", 93 | max_font_size=80, 94 | width = 2000, 95 | height = 1800, 96 | mask = bg_pic, 97 | mode = "RGBA" 98 | ) 99 | word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values} 100 | # print(word_frequence) 101 | """ 102 | word_frequence_list = [] 103 | for key in word_frequence: 104 | temp = (key, word_frequence[key]) 105 | word_frequence_list.append(temp) 106 | #print(word_frequence_list) 107 | """ 108 | wordcloud = wordcloud.fit_words(word_frequence) 109 | 110 | image_colors = ImageColorGenerator(bg_pic) # 根据图片生成词云颜色 111 | 112 | plt.imshow(wordcloud) #显示词云图片 113 | plt.axis("off") 114 | plt.show() 115 | wordcloud.to_file('show_Chinese.png') # 把词云保存下来 116 | 117 | main() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/show_Chinese.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/show_Chinese.png -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/13_python爬取豆瓣书籍信息/books.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/13_python爬取豆瓣书籍信息/books.csv -------------------------------------------------------------------------------- 
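Note on the word-cloud script above (12_python爬取并分析豆瓣中最新电影的影评.py): its word-frequency step uses the dict-renaming form of GroupBy.agg ({"计数": numpy.size}), which pandas 1.0 and later no longer accept. A minimal equivalent sketch, assuming a recent pandas and reusing the variable names from that script (an assumed drop-in, not the author's original code):

words_stat = (
    words_df['segment']
    .value_counts()                      # count each word; already sorted descending
    .rename_axis('segment')
    .reset_index(name='计数')            # back to a DataFrame with columns ['segment', '计数']
)
word_frequence = dict(zip(words_stat['segment'].head(1000),
                          words_stat['计数'].head(1000)))

The resulting word_frequence dict can then be passed to wordcloud.fit_words() exactly as in the original script.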
/Python3网络爬虫中小型项目实战集中营/13_python爬取豆瓣书籍信息/python爬取豆瓣书籍信息.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | #爬虫爬取豆瓣书目录 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import json 7 | import pandas #该库用于对爬取的信息进行表格性操作 8 | from skimage import io#该库用于打印爬取到的照片 9 | 10 | url = 'https://market.douban.com/book/?utm_campaign=book_nav_freyr&utm_source=douban&utm_medium=pc_web&page={}&page_num=18' 11 | 12 | #该函数式用来返回一个列表存放含有书籍信息的字典 13 | def bookList(url): 14 | newurl = requests.get(url) 15 | soup = BeautifulSoup(newurl.text,'html.parser') 16 | result_total = [] 17 | for book in soup.select('.book-item'): 18 | if len(book.select('.book-brief'))>0: 19 | 20 | bookimag2 = io.imread(book.select('img')[1]['src'])#书的图片 21 | io.imshow(bookimag2) 22 | #io.show()#为了使爬取到的图片显示出来 23 | 24 | bookurl = book.select('a')[0]['href']#抓取书的链接url 25 | #print('链接: ',bookurl) 26 | 27 | result_total.append(booktextscore(bookurl))#将所抓取书的信息字典添加到列表里面 28 | 29 | bookimag1 = io.imread(book.select('img')[0]['src'])#背景图片 30 | io.imshow(bookimag1) 31 | #io.show() 32 | return result_total #返回一个列 33 | 34 | 35 | #该函数式用来爬取书籍的名字,评分,评价人数以及书的简单介绍 36 | def booktextscore(url): 37 | booktexturl = requests.get(url) 38 | soup = BeautifulSoup(booktexturl.text,'html.parser') 39 | result = {}#创建一个字典将相关书籍信息存入到字典中 40 | bookname = soup.select('.book-breintro h3')[0].text 41 | bookname2 = '《' + bookname + '》' 42 | print(bookname2) 43 | result['书籍名称'] = bookname2 44 | 45 | bookauthor = soup.select('.book-public')[0].text.lstrip('\n ').rstrip('\n ') 46 | result['作者'] = bookauthor 47 | print(bookauthor) 48 | 49 | print(url) 50 | result['书籍链接'] = url 51 | 52 | score = soup.select('.total-score')[0].text#爬取该书评分,其中可能含有有些书籍由于评论人数不足导致没有评分,加一个判断默认该种情况成评分为0 53 | if score == '评价人数不足': 54 | score = 0 55 | score = float(score) 56 | result['书籍评分'] = float(score)#将评分强制转换成float类型的 57 | print('评分:',score) 58 | 59 | commentnum = soup.select('.comment-number')[0].text#爬取本书评论人数 60 | print(commentnum) 61 | print('该书简介:\n') 62 | result['书籍评论人数'] = commentnum 63 | 64 | article = []#添加一个列表 65 | for ench in soup.select('.layout-content'):#爬取的是图书详情 66 | for p in ench.select('.paragraph-content p')[:-1]: 67 | article.append(p.text.strip())#将p标签中的文字添加到列表中 68 | articlebook = '\n '.join(article) 69 | #print(articlebook) 70 | #result['书籍简介'] = articlebook 71 | return result 72 | 73 | 74 | 75 | book_total = [] 76 | #由于书籍信息有两页,所以加一个循环将两页书籍信息都添加进列表中方便生成表格 77 | for ench in range(1,4): 78 | newurl = url.format(ench)#通过format将URL地址实现可变性,可以将两页书籍信息都打印出来 79 | book_result = bookList(newurl) 80 | book_total.extend(book_result) 81 | 82 | df = pandas.DataFrame(book_total) 83 | df.to_csv('books.csv')#将爬取后的书籍信息通过pandas转换成表格形式 84 | 85 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/14_python爬取今日头条信息并导入mongodb数据库/python爬取今日头条信息并导入mongodb数据库.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import re 5 | # 导入mongo 数据库 6 | import pymongo 7 | import json 8 | 9 | # 打开数据库连接,mongodb默认端口为27017 10 | conn = pymongo.MongoClient(host='localhost',port=27017) 11 | # 选择或创建数据库 12 | toutiao = conn['toutiao'] 13 | # 选择或者创建数据集合 14 | newsdata = toutiao['news'] 15 | 16 | toutiaoUrl = 'http://www.toutiao.com/api/pc/focus/' 17 | reqData = requests.get(toutiaoUrl).text 18 | print(reqData) 19 | 20 | jsonData = json.loads(reqData) 21 | newsData = jsonData['data']['pc_feed_focus'] 22 | 23 | # 存储到数据库 24 
| for new in newsData: 25 | title = new['title'] 26 | img_url = new['image_url'] 27 | url = new['media_url'] 28 | data = { 29 | 'title':title, 30 | 'img_url':img_url, 31 | 'url':url 32 | } 33 | # 插入一行数据 34 | newsdata.insert_one(data) 35 | 36 | for i in newsdata.find(): 37 | # 从数据库中读取出来 38 | print('i'+str(i)) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/15_python使用selenium爬取百度招聘内容并存入mongodb数据库/python使用selenium爬取百度招聘内容并入mongodb数据库.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import time 5 | 6 | from selenium import webdriver 7 | from selenium.webdriver.common.keys import Keys 8 | from selenium.webdriver.common.action_chains import ActionChains 9 | 10 | from bs4 import BeautifulSoup 11 | 12 | import pymongo 13 | 14 | # 多进程 15 | from multiprocessing import Pool 16 | 17 | # 1 打开数据库连接,mongodb默认端口为27017 18 | conn = pymongo.MongoClient(host='localhost',port=27017) 19 | # 2 选择或创建数据库 20 | jobdata = conn['baidujobs'] 21 | # 3 选择或创建数据集合 22 | ver_job = jobdata['verjob'] 23 | 24 | baidu_baseurl = 'http://zhaopin.baidu.com/quanzhi?tid=4139&ie=utf8&oe=utf8&query=python%E6%9D%AD%E5%B7%9E&city_sug=%E6%9D%AD%E5%B7%9E' 25 | def set_winscroll(driver): 26 | time.sleep(2) 27 | driver.execute_script('window.scrollBy(0,2000)') 28 | time.sleep(3) 29 | driver.execute_script('window.scrollBy(0,3000)') 30 | time.sleep(3) 31 | 32 | 33 | # 1 初始化driver 34 | driver = webdriver.PhantomJS() 35 | # 2 调用get方法 36 | driver.get(baidu_baseurl) 37 | # 3 进入网页 38 | set_winscroll(driver) 39 | 40 | # 4 获取资源(第一页的数据) 41 | we_data = driver.page_source 42 | # print('first_we_data ' + we_data) 43 | 44 | 45 | def parse_html(html): 46 | soup = BeautifulSoup(html, 'lxml') 47 | item_url = soup.findAll('a', {'class': 'clearfix item line-bottom'}) 48 | # for item in zip(item_url): 49 | # print(item.get('href')) 50 | 51 | # 职位信息 52 | jobs = soup.findAll('div', {'class': 'title-h3 line-clamp1'}) 53 | # for job in jobs: 54 | # print(job.string) # 职位信息 55 | # 地址 + 公司名 56 | compy = soup.findAll('p', {'class': 'area line-clamp1'}) 57 | # for com in compy: 58 | # print(com.string) 59 | 60 | # 薪资 61 | salarys = soup.findAll('p', {'class': 'salary'}) 62 | # for salary in salarys: 63 | # print(salary.string) 64 | # 发布时间跟发布来源网站 65 | addresss = soup.findAll('div', {'class': 'right time'}) 66 | # print(addresss) 67 | reg = r'
    (.*?)
    ' 68 | regx = re.compile(reg) 69 | ads = re.findall(regx, str(addresss)) 70 | # print(ads) 71 | # for adds in ads: 72 | # data = adds.split('|') 73 | # print(data) 74 | for itm_url, job_detail, ver_compny, ver_salary, ver_addres in zip(item_url, jobs, compy, salarys, ads): 75 | data = { 76 | 'itme_url': 'http://zhaopin.baidu.com'+itm_url.get('href'), 77 | 'job_detail': job_detail.string, 78 | 'ver_compny': str(ver_compny.string), 79 | 'ver_salary': ver_salary.string, 80 | 'ver_addres': str(ver_addres).split('|'), 81 | } 82 | print(data) 83 | # 插入数据库 84 | ver_job.insert_one(data) # 插入数据库失败 85 | f.write(str(data)) 86 | 87 | 88 | def get_page_source(page_num): 89 | time.sleep(2) 90 | driver.find_element_by_xpath('//*[@id="pagination"]/p/span/a[%s]' % page_num).click() 91 | # //*[@id="pagination"]/p/span/a[1] 为在第一页的按钮 92 | # //*[@id="pagination"]/p/span/a[2] 为第二页的按钮 93 | set_winscroll(driver) 94 | we_data = driver.page_source 95 | return we_data 96 | 97 | f = open('百度招聘前30页杭州.csv', 'a',encoding='utf-8') 98 | # 首页的数据 99 | def getBaiduHangZhouJob(we_data): 100 | parse_html(we_data) 101 | for i in range(1, 50): 102 | if i==1: 103 | we_data = get_page_source(1) 104 | parse_html(we_data) 105 | elif i<=5: 106 | we_data = get_page_source(str(2)) 107 | parse_html(we_data) 108 | else: 109 | we_data = get_page_source(str(3)) 110 | parse_html(we_data) 111 | f.close() 112 | 113 | 114 | if __name__ == '__main__': 115 | getBaiduHangZhouJob(we_data) 116 | # pool = Pool(processes=10) 117 | # pool.map_async(getBaiduHangZhouJob(we_data)) 118 | # pool.close() 119 | # f.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/16_python爬取熊猫直播用户信息/python爬取熊猫直播用户信息.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import json 5 | import pandas as pd 6 | 7 | url = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=%d&pagenum=120&_=%d".format(a=range(0,35),b=range(1501946526480,1501946526880)) 8 | 9 | headers = { 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0' 11 | , 12 | 'Cookie': '__guid=96554777.3243119502220345300.1500627276199.6702; smid=608e0bde-ffe2-4251-90ca-2938cabdc074; monitor_count=18' 13 | , 14 | } 15 | 16 | 17 | def getHtml(url): 18 | req = requests.get(url, headers=headers) 19 | print(req.text) 20 | return req.text 21 | 22 | 23 | def printInfos(data): 24 | jsondata = json.loads(data, "utf-8") 25 | # print(jsondata) 26 | itemsinfo = jsondata['data']['items'] 27 | items_list = [] 28 | for pinfo in itemsinfo: 29 | name = pinfo['name'] 30 | person_num = pinfo['person_num'] 31 | nickName = pinfo['userinfo']['nickName'] 32 | lelvel = pinfo['host_level_info'] 33 | lable = pinfo['label'] 34 | cname = pinfo['classification'] 35 | item_list = [name, person_num, nickName, lelvel, label, cname] 36 | items_list.append(item_list) 37 | df = pd.DataFrame(items_list, columns = ['name','person_num','nickName','host_level_info','label','classification']) 38 | df.to_csv('熊猫直播用户信息.csv') 39 | 40 | 41 | def mainStart(): 42 | for n in range(0, 3): 43 | pageindex = 1 + n 44 | pagetime = int(1501946526480 + n) 45 | url = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=%d&pagenum=120&_=%d"%(pageindex,pagetime) 46 | data = getHtml(url) 47 | printInfos(data) 48 | 49 | mainStart() -------------------------------------------------------------------------------- 
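Two runtime issues are worth flagging in the 熊猫直播 script above: json.loads(data, "utf-8") passes a positional argument that Python 3's json.loads does not accept, and the loop assigns lable but later reads label, which raises NameError. A minimal corrected sketch of printInfos under those assumptions (imports repeated for completeness; not the author's original code):

import json
import pandas as pd

def printInfos(data):
    jsondata = json.loads(data)                    # Python 3: json.loads takes only the string
    itemsinfo = jsondata['data']['items']
    items_list = []
    for pinfo in itemsinfo:
        items_list.append([
            pinfo['name'],
            pinfo['person_num'],
            pinfo['userinfo']['nickName'],
            pinfo['host_level_info'],
            pinfo['label'],                        # fixed: original assigned `lable` but used `label`
            pinfo['classification'],
        ])
    df = pd.DataFrame(items_list,
                      columns=['name', 'person_num', 'nickName',
                               'host_level_info', 'label', 'classification'])
    df.to_csv('熊猫直播用户信息.csv', index=False)  # note: called once per page, so each call overwrites the file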
/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl youtx -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl youtx -o items.csv 时以csv格式保存下载数据 3 | 4 | 5 | Scrapy必须背下来的命令: 6 | 1 创建项目: scrapy startproject youtxNanJin 7 | startproject: 表示创建项目 8 | youtxNanJin: 表示创建的项目名 9 | 10 | 2 创建爬虫: scrapy genspider youtx "http://www.youtx.com" 11 | genspider: 表示生成一个爬虫(默认是scrapy.Spider类) 12 | youtx: 表示爬虫名(对应爬虫代码里的 name 参数) 13 | "http://www.youtx.com": 表示允许爬虫爬取的域范围 14 | 15 | 3 执行爬虫: scrapy crawl youtx 16 | crawl: 表示启动一个sc rapy爬虫 17 | youtx: 表示需要启动的爬虫名(对应爬虫代码里的 name 参数) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = youtxNanJin.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = youtxNanJin 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class YoutxnanjinItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 房源名称 17 | homeName = scrapy.Field() 18 | # 房源链接 19 | homeLine = scrapy.Field() 20 | # 房租单价 21 | homeSinglePrice = scrapy.Field() 22 | # 房租地址 23 | homeAddress = scrapy.Field() 24 | # 房租近期信息 25 | homeDetai = scrapy.Field() 26 | # 满七天价格 27 | homeSeven = scrapy.Field() 28 | # 满30天价格 29 | homeThirth = scrapy.Field() 30 | 31 | # 房东 32 | homePerson = scrapy.Field() 33 | # 房东头像 34 | homePersonImg = scrapy.Field() 35 | # 房东头像链接 36 | homePersonLink = scrapy.Field() 37 | 38 | # 房子大图 39 | homePicBg = scrapy.Field() 40 | # 房子大图链接 41 | homePicLink = scrapy.Field() 42 | 43 | # 品牌店铺信息 44 | # homePinPai = scrapy.Field() 45 | # 明星房东 46 | # homeStarrPerson = scrapy.Field() 47 | 48 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class YoutxnanjinSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | from scrapy.conf import settings 9 | import pymongo 10 | 11 | 12 | class YoutxnanjinPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | 17 | class YouTXMongo(object): 18 | def __init__(self): 19 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 20 | self.db = self.client[settings['MONGO_DB']] 21 | self.post = self.db[settings['MONGO_COLL']] 22 | 23 | def process_item(self, item, spider): 24 | postItem = dict(item) 25 | self.post.insert(postItem) 26 | return item 27 | 28 | # 写入json文件 29 | class JsonWritePipline(object): 30 | def __init__(self): 31 | self.file = open('游天下南京.json','w',encoding='utf-8') 32 | 33 | def process_item(self,item,spider): 34 | line = json.dumps(dict(item),ensure_ascii=False)+"\n" 35 | self.file.write(line) 36 | return item 37 | 38 | def spider_closed(self,spider): 39 | self.file.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for youtxNanJin project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'youtxNanJin' 13 | 14 | SPIDER_MODULES = ['youtxNanJin.spiders'] 15 | NEWSPIDER_MODULE = 'youtxNanJin.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'youtxNanJin (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # 配置mongoDB 27 | MONGO_HOST = "127.0.0.1" # 主机IP 28 | MONGO_PORT = 27017 # 端口号 29 | MONGO_DB = "YouTianXia" # 库名 30 | MONGO_COLL = "house_nanjin" # collection 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | #DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | #} 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'youtxNanJin.middlewares.YoutxnanjinSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'youtxNanJin.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | # 'youtxNanJin.pipelines.YoutxnanjinPipeline': 300, 77 | 'youtxNanJin.pipelines.YouTXMongo': 300, 78 | 'youtxNanJin.pipelines.JsonWritePipline': 300, 79 | } 80 | 81 | # Enable and configure the AutoThrottle extension (disabled by default) 82 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 83 | #AUTOTHROTTLE_ENABLED = True 84 | # The initial download delay 85 | #AUTOTHROTTLE_START_DELAY = 5 86 | # The maximum download delay to be set in case of high latencies 87 | #AUTOTHROTTLE_MAX_DELAY = 60 88 | # The average number of requests Scrapy should be sending in parallel to 89 | # each remote server 90 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 91 | # Enable showing throttling stats for every response received: 92 | #AUTOTHROTTLE_DEBUG = False 93 | 94 | # Enable and configure HTTP caching (disabled by default) 95 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 96 | #HTTPCACHE_ENABLED = True 97 | #HTTPCACHE_EXPIRATION_SECS = 0 98 | #HTTPCACHE_DIR = 'httpcache' 99 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 100 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 101 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/youtxNanJin_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/youtxNanJin_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/youtxNanJin_spider.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import scrapy 3 | from youtxNanJin.items import YoutxnanjinItem 4 | 5 | class NanJinDefault(scrapy.Spider): 6 | name = 'youtx' 7 | allowed_domains = ['youtx.com'] 8 | start_urls = ["http://www.youtx.com/nanjing/longrent1-page{}".format(n) for n in range(0,6)] 9 | def parse(self, response): 10 | # print(response.body) 11 | node_list = response.xpath("//div[@class='duanzu houseList']/ul/li[@class='clearfix']") 12 | # print(node_list) 13 | for node in node_list: 14 | item = YoutxnanjinItem() 15 | homeName = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/text()").extract() 16 | homeLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/@href").extract() 17 | print(homeName) 18 | print(homeLink) 19 | 20 | # 单日价格 21 | homeSinglePrice = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/span/span[@class='housePrice']/text()").extract() 22 | print(homeSinglePrice) 23 | 24 | # 获取房源地址 25 | homeAddress = node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='clearfix mt5']/text()").extract() 26 | # 房租信息 27 | homeDesc =node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='mt5']/text()").extract() 28 | homeDesc2 =node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left 
mt2']/p[@class='mt5']/span[2]/text()").extract() 29 | print(homeAddress) 30 | print(homeDesc) 31 | print(homeDesc2) 32 | 33 | # 满30天的信息 34 | homeThrty = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/div[@class='mix12_5']/div[@class='discount']/div[@class='discount-price']/span//text()").extract() 35 | print(homeThrty) 36 | # 房东信息 37 | homePerson = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/text()").extract() 38 | # 房东链接 39 | homePersonLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/@href").extract() 40 | print(homePerson) 41 | print(homePersonLink) 42 | 43 | # 房源大图图片 44 | homeBigPic = node.xpath("./div[@class='house-img']/a[1]/img/@src").extract() 45 | homeBigPicLink = node.xpath("./div[@class='house-img']/a[1]/@href").extract() 46 | print(homeBigPic) 47 | print(homeBigPicLink) 48 | # 房东头像信息 49 | personPic = node.xpath("./div[@class='house-img']/a[2]/img/@src").extract() 50 | # 房东头像链接地址 51 | personPicLink = node.xpath("./div[@class='house-img']/a[2]/img/@href").extract() 52 | 53 | print(personPic) 54 | print(homePersonLink) 55 | item['homeName'] ="".join(homeName) 56 | item['homeLine'] ="".join(homeLink) 57 | item['homeSinglePrice'] ="".join(homeSinglePrice) 58 | item['homeAddress'] ="".join(homeAddress) 59 | item['homeDetai'] ="".join(homeDesc)+"".join(homeDesc2) 60 | # 这里的值暂时没有取出来 61 | item['homeSeven'] ="".join(homeThrty) 62 | item['homeThirth'] ="".join(homeThrty) 63 | 64 | item['homePerson'] ="".join(homePerson) 65 | item['homePersonImg'] ="".join(personPic) 66 | item['homePersonLink'] ="".join(homePersonLink) 67 | item['homePicBg'] ="".join(homeBigPic) 68 | item['homePicLink'] ="".join(homeBigPicLink) 69 | yield item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl docNet -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl docNet -o items.csv 时以csv格式保存下载数据 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ChinadoctornetItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | 15 | # 爬取中国医学人才网的条目(共5个条目) 16 | # 医院名称 17 | hospitalName = scrapy.Field() 18 | # 医院规模 19 | hospitalSize = scrapy.Field() 20 | # 医院所在地 21 | hospitalAddress = scrapy.Field() 22 | # 医院科目 23 | hospitalDesc = scrapy.Field() 24 | # pass 25 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ChinadoctornetSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 
35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | # import json 8 | 9 | class ChinadoctornetPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | 13 | 14 | # class JsonWriterPipeline(object): 15 | # def __init__(self): 16 | # self.file = open('中国医学人才网招聘最新招聘专栏2.json', 'w', encoding='utf-8') 17 | 18 | # def process_item(self, item, spider): 19 | # line = json.dumps(dict(item), ensure_ascii=False) + "\n" 20 | # self.file.write(line) 21 | # return item 22 | 23 | # def spider_closed(self, spider): 24 | # self.file.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for chinadoctornet project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'chinadoctornet' 13 | 14 | SPIDER_MODULES = ['chinadoctornet.spiders'] 15 | NEWSPIDER_MODULE = 'chinadoctornet.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'chinadoctornet (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'chinadoctornet.middlewares.ChinadoctornetSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'chinadoctornet.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | # ITEM_PIPELINES = { 68 | # # 'chinadoctornet.pipelines.ChinadoctornetPipeline': 300, 69 | # 'chinadoctornet.pipelines.JsonWritePipline': 300, 70 | # } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
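One detail worth flagging in the chinadoctornet project above: the commented-out ITEM_PIPELINES entry refers to 'chinadoctornet.pipelines.JsonWritePipline', while pipelines.py defines (also commented out) JsonWriterPipeline. The README's `scrapy crawl docNet -o items.json` works without any pipeline, but if the pipeline route were re-enabled, the registered dotted path must match the class name exactly. A hedged sketch of the re-enabled form:

# settings.py (hypothetical re-enabled form; class name must match pipelines.py)
ITEM_PIPELINES = {
    'chinadoctornet.pipelines.JsonWriterPipeline': 300,
}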
-------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/chinadoctornet_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/chinadoctornet_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/chinadoctornet_spider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import scrapy 3 | from chinadoctornet.items import ChinadoctornetItem 4 | 5 | 6 | class ChinaDocNet(scrapy.Spider): 7 | # 启动爬虫的名称 8 | name = 'docNet' 9 | # 爬取域名的范围 10 | allowed_domains = ['yixuezp.com'] 11 | # 爬虫第一个url地址 12 | start_urls = ['http://www.yixuezp.com/zhaopin?page={}'.format(n) for n in range(0, 464)] # 463 13 | 14 | def parse(self, response): 15 | # 医院name 16 | node_list = response.xpath("//div[@class='newsjob']/ul/li") 17 | items = [] 18 | for node in node_list: 19 | item = ChinadoctornetItem() 20 | hospitalName = node.xpath("./a/text()").extract() 21 | hospitalSize = node.xpath("./span[1]/text()").extract() 22 | hospitalAddress = node.xpath("./span[2]/text()").extract() 23 | hospitalDesc = node.xpath("./p/a/text()").extract() 24 | 25 | item['hospitalName'] = hospitalName 26 | item['hospitalSize'] = hospitalSize 27 | item['hospitalAddress'] = hospitalAddress 28 | item['hospitalDesc'] = hospitalDesc 29 | items.append(item) 30 | # return items # 如果直接return的话,一页数据只会返回一条数据 31 | yield item #用yield 的话,可以交给下载器,继续执行下一步操作。 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = chinadoctornet.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = chinadoctornet 12 | -------------------------------------------------------------------------------- 
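The closing comment in chinadoctornet_spider.py explains why parse() yields each item instead of returning: a return inside the loop ends the callback after the first entry, while yield hands every item to the engine and keeps iterating. A simplified, self-contained illustration of that point (generic functions, not part of the project):

def collect_with_return(nodes):
    for node in nodes:
        return [node]          # function ends here: only the first element is produced

def collect_with_yield(nodes):
    for node in nodes:
        yield node             # a generator: every element is produced in turn

print(collect_with_return([1, 2, 3]))        # [1]
print(list(collect_with_yield([1, 2, 3])))   # [1, 2, 3]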
/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/中国医学人才网招聘最新招聘专栏.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/中国医学人才网招聘最新招聘专栏.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl doubanMovie -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl doubanMovie -o items.csv 时以csv格式保存下载数据 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanmovieItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 电影名字 17 | name = scrapy.Field() 18 | # 电影信息 19 | info = scrapy.Field() 20 | # 评分 21 | rating = scrapy.Field() 22 | # 评论人数 23 | num = scrapy.Field() 24 | # 经典语句 25 | quote = scrapy.Field() 26 | # 电影图片 27 | img_url = scrapy.Field() 28 | 
-------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DoubanmovieSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanmoviePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for doubanmovie project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'doubanmovie' 13 | 14 | SPIDER_MODULES = ['doubanmovie.spiders'] 15 | NEWSPIDER_MODULE = 'doubanmovie.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'doubanmovie (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'doubanmovie.middlewares.DoubanmovieSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'doubanmovie.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'doubanmovie.pipelines.DoubanmoviePipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
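A small optional addition to the doubanmovie settings above: when exporting with the README's `scrapy crawl doubanMovie -o items.json`, Scrapy's JSON exporter escapes non-ASCII characters by default, so the Chinese titles and quotes come out as \uXXXX sequences. Assuming Scrapy 1.2 or later, one extra line in settings.py keeps them readable:

# settings.py (assumed addition, not in the original project)
FEED_EXPORT_ENCODING = 'utf-8'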
-------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/doubanmovie_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/doubanmovie_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/doubanmovie_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from doubanmovie.items import DoubanmovieItem 3 | 4 | class Movie(scrapy.Spider): 5 | # 爬虫唯一标识符 6 | name = 'doubanMovie' 7 | # 爬取域名 8 | allowed_domains = ['movie.douban.com'] 9 | # 爬取页面地址 10 | start_urls = ['https://movie.douban.com/top250'] 11 | 12 | def parse(self, response): 13 | selector = scrapy.Selector(response) 14 | # 解析出各个电影 15 | movies = selector.xpath('//div[@class="item"]') 16 | 17 | for movie in movies: 18 | # 存放电影信息 19 | item = DoubanmovieItem() 20 | 21 | # 电影各种语言名字的列表 22 | titles = movie.xpath('.//span[@class="title"]/text()').extract() 23 | # 将中文名与英文名合成一个字符串 24 | name = '' 25 | for title in titles: 26 | name += title.strip() 27 | item['name'] = name 28 | 29 | # 电影信息列表 30 | infos = movie.xpath('.//div[@class="bd"]/p/text()').extract() 31 | # 电影信息合成一个字符串 32 | fullInfo = '' 33 | for info in infos: 34 | fullInfo += info.strip() 35 | item['info'] = fullInfo 36 | # 提取评分信息 37 | item['rating'] = movie.xpath('.//span[@class="rating_num"]/text()').extract()[0].strip() 38 | # 提取评价人数 39 | item['num'] = movie.xpath('.//div[@class="star"]/span[last()]/text()').extract()[0].strip()[:-3] 40 | # 提取经典语句,quote可能为空 41 | quote = movie.xpath('.//span[@class="inq"]/text()').extract() 42 | if quote: 43 | quote = quote[0].strip() 44 | item['quote'] = quote 45 | # 提取电影图片 46 | item['img_url'] = movie.xpath('.//img/@src').extract()[0] 47 | 48 | yield item 49 | 50 | next_page = selector.xpath('//span[@class="next"]/a/@href').extract_first() 51 | if next_page: 52 | url = 'https://movie.douban.com/top250' + next_page 53 | yield scrapy.Request(url, callback=self.parse) 54 | 55 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/scrapy.cfg:
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = doubanmovie.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = doubanmovie 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MakedreamItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 文章标题 17 | articleTitle = scrapy.Field() 18 | # 文章标题url 19 | articleUrl = scrapy.Field() 20 | # 文章描述 21 | articleDesc = 
scrapy.Field() 22 | # 文章发布时间 23 | articlePublic = scrapy.Field() 24 | # 文章类型 25 | articleType = scrapy.Field() 26 | # 文章标签 27 | articleTag = scrapy.Field() 28 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MakedreamSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | import pymongo 9 | from scrapy.conf import settings 10 | 11 | class MakedreamPipeline(object): 12 | def process_item(self, item, spider): 13 | return item 14 | 15 | 16 | class DreamMongo(object): 17 | def __init__(self): 18 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 19 | self.db = self.client[settings['MONGO_DB']] 20 | self.post = self.db[settings['MONGO_COLL']] 21 | 22 | def process_item(self, item, spider): 23 | postItem = dict(item) 24 | self.post.insert(postItem) 25 | return item 26 | 27 | 28 | # 写入json文件类 29 | class JsonWritePipeline(object): 30 | def __init__(self): 31 | self.file = open('织梦网其他编程.json', 'w', encoding='utf-8') 32 | 33 | def process_item(self, item, spider): 34 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 35 | self.file.write(line) 36 | return item 37 | 38 | def close_spider(self, spider): 39 | self.file.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for makedream project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'makedream' 13 | 14 | SPIDER_MODULES = ['makedream.spiders'] 15 | NEWSPIDER_MODULE = 'makedream.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'makedream (+http://www.yourdomain.com)' 20 | # 配置mongoDB 21 | MONGO_HOST = "127.0.0.1" # 主机IP 22 | MONGO_PORT = 27017 # 端口号 23 | MONGO_DB = "DreamDB" # 库名 24 | MONGO_COLL = "Dream_info" # collection 25 | 26 | 27 | 28 | # Obey robots.txt rules 29 | ROBOTSTXT_OBEY = False 30 | 31 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 32 | #CONCURRENT_REQUESTS = 32 33 | 34 | # Configure a delay for requests for the same website (default: 0) 35 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 36 | # See also autothrottle settings and docs 37 | #DOWNLOAD_DELAY = 3 38 | # The download delay setting will honor only one of: 39 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 40 | #CONCURRENT_REQUESTS_PER_IP = 16 41 | 42 | # Disable cookies (enabled by default) 43 | # COOKIES_ENABLED = False 44 | 45 | # Disable Telnet Console (enabled by default) 46 | #TELNETCONSOLE_ENABLED = False 47 | 48 | # Override the default request headers: 49 | #DEFAULT_REQUEST_HEADERS = { 50 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 51 | # 'Accept-Language': 'en', 52 | #} 53 | 54 | # Enable or disable spider middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 56 | #SPIDER_MIDDLEWARES = { 57 | # 'makedream.middlewares.MakedreamSpiderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable downloader middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | #DOWNLOADER_MIDDLEWARES = { 63 | # 'makedream.middlewares.MyCustomDownloaderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | ITEM_PIPELINES = { 75 | # 'makedream.pipelines.MakedreamPipeline': 300, 76 | 'makedream.pipelines.JsonWritePipeline':300, 77 | 'makedream.pipelines.DreamMongo':300 78 | } 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 
99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/makedream_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/makedream_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/makedream_spider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import scrapy 3 | from makedream.items import MakedreamItem 4 | 5 | 6 | class DramingNet(scrapy.Spider): 7 | # 启动爬虫的名称 8 | name = 'dreaming' 9 | # 爬虫的域范围 10 | allowed_domains = ['zhimengzhe.com'] 11 | # 爬虫的第一个url 12 | start_urls = ['http://www.zhimengzhe.com/bianchengjiaocheng/qitabiancheng/index_{}.html'.format(n) for n in 13 | range(0, 1466)] 14 | 15 | # 爬取结果解析 16 | def parse(self, response): 17 | base_url = 'http://www.zhimengzhe.com' 18 | # print(response.body) 19 | node_list = response.xpath("//ul[@class='list-unstyled list-article']/li") 20 | for node in node_list: 21 | item = MakedreamItem() 22 | nextNode = node.xpath("./div[@class='pull-left ltxt w658']") 23 | print('*' * 30) 24 | title = nextNode.xpath('./h3/a/text()').extract() 25 | link = nextNode.xpath('./h3/a/@href').extract() 26 | desc = nextNode.xpath('./p/text()').extract() 27 | 28 | # 创建时间,类型,标签 29 | publicTime = nextNode.xpath("./div[@class='tagtime']/span[1]/text()").extract() 30 | publicType = nextNode.xpath("./div[@class='tagtime']/span[2]/a/text()").extract() 31 | publicTag = nextNode.xpath("./div[@class='tagtime']/span[3]/a/text()").extract() 32 | # node 33 | titleLink = base_url + ''.join(link) 34 | item['articleTitle'] = title 35 | # 文章标题url 36 | item['articleUrl'] = titleLink 37 | # 文章描述 38 | item['articleDesc'] = desc 39 | # 文章发布时间 40 | item['articlePublic'] = publicTime 41 | # 文章类型 42 | item['articleType'] = publicType 43 | # 文章标签 44 | item['articleTag'] = publicTag 45 | yield item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = makedream.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = makedream 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ComentsAnaylst.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2018/4/25 11:15 3 | # @File : commentsAnaylst.py(再见前任3的影评f词云) 4 | 5 | import matplotlib.pyplot as plt 6 | from PIL import Image 7 | from wordcloud import WordCloud 8 | import jieba 9 | import numpy as np 10 | #读取txt格式的文本内容 11 | text_from_file_with_apath = open('douban.txt','rb').read() 12 | 13 | #使用jieba进行分词,并对分词的结果以空格隔开 14 | wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all = True) 15 | wl_space_split = " ".join(wordlist_after_jieba) 16 | 17 | #对分词后的文本生成词云 18 | # my_wordcloud = WordCloud().generate(wl_space_split) 19 | 20 | font = r'C:\Windows\Fonts\simfang.ttf' 21 | mask = np.array(Image.open('ciyun.jpg')) 22 | wc = WordCloud(mask=mask,max_words=3000,collocations=False, font_path=font, width=5800, height=2400, margin=10,background_color='black').generate(wl_space_split) 23 | default_colors = wc.to_array() 24 | plt.title("QR 3") 25 | plt.imshow(wc) 26 | plt.axis("off") 27 | plt.savefig("ciyun.png") 28 | plt.show() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.jpg -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.png -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/douban_qianren3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2018/4/25 11:15 3 | # @File : test_douban_qianren3.py(再见前任3的影评) 4 | 5 | import csv 6 | import requests 7 | from lxml import etree 8 | import time 9 | 10 | 11 | url = 'https://movie.douban.com/subject/26662193/comments?start=0&limit=20&sort=new_score&status=P&percent_type=' 12 | 13 | headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36', 15 | 'Cookie': 'gr_user_id=ffdf2f63-ec37-49b5-99e8-0e0d28741172; bid=qh9RXgIGopg; viewed="26826540_24703171"; ap=1; ll="118172"; ct=y; _vwo_uuid_v2=8C5B24903B1D1D3886FE478B91C5DE97|7eac18658e7fecbbf3798b88cfcf6113; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1522129522%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DdnHqCRiT1HlhToCp0h1cpdyV8rB9f_OfOvJhjRPO3p1jrl764LGvi7gbYSdskDMh%26wd%3D%26eqid%3De15db1bb0000e3cd000000045ab9b6fe%22%5D; 
_pk_id.100001.4cf6=4e61f4192b9486a8.1485672092.10.1522130672.1522120744.; _pk_ses.100001.4cf6=*'} 16 | 17 | 18 | def get_html(current_url): 19 | time.sleep(2) 20 | r = requests.get(current_url, headers=headers) 21 | r.raise_for_status() 22 | return etree.HTML(r.text) 23 | 24 | 25 | def parse_html(content,writer): 26 | links = content.xpath("//*[@class='comment-item']") 27 | for link in links: 28 | content = link.xpath("./div[@class='comment']/p/text()")[0].strip() 29 | author = link.xpath("./div[@class='comment']/h3/span[@class='comment-info']/a/text()")[0].strip() 30 | time = link.xpath("./div[@class='comment']/h3/span[@class='comment-info']/span[@class='comment-time ']/text()")[ 31 | 0].strip() 32 | is_useful = link.xpath("./div[@class='comment']/h3/span[@class='comment-vote']/span[@class='votes']/text()")[0] 33 | print('content:', content) 34 | print('time:', time) 35 | print('is_useful:', is_useful) 36 | # detail = (author, time, is_useful, content) 37 | detail = (is_useful,content) 38 | writer.writerow(detail) 39 | 40 | 41 | if __name__ == '__main__': 42 | with open('douban.txt', 'a+', encoding='utf-8', newline='') as csvf: 43 | writer = csv.writer(csvf) 44 | writer.writerow(('作者', '时间', '有用数', '内容')) 45 | for page in range(0, 260, 20): 46 | url = 'https://movie.douban.com/subject/26662193/comments?start={}&limit=20&sort=new_score&status=P&percent_type='.format( 47 | page) 48 | r = get_html(url) 49 | parse_html(r,writer) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/22_python爬取Bilibili用户信息并导入mysql数据库/bilibili_user.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf8-*- 2 | 3 | import requests 4 | import json 5 | import random 6 | import pymysql 7 | import sys 8 | import datetime 9 | import time 10 | from imp import reload 11 | from multiprocessing.dummy import Pool as ThreadPool 12 | 13 | 14 | def datetime_to_timestamp_in_milliseconds(d): 15 | def current_milli_time(): return int(round(time.time() * 1000)) 16 | 17 | return current_milli_time() 18 | 19 | 20 | reload(sys) 21 | 22 | 23 | def LoadUserAgents(uafile): 24 | """ 25 | uafile : string 26 | path to text file of user agents, one per line 27 | """ 28 | uas = [] 29 | with open(uafile, 'rb') as uaf: 30 | for ua in uaf.readlines(): 31 | if ua: 32 | uas.append(ua.strip()[1:-1 - 1]) 33 | random.shuffle(uas) 34 | return uas 35 | 36 | 37 | uas = LoadUserAgents("user_agents.txt") 38 | head = { 39 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', 40 | 'X-Requested-With': 'XMLHttpRequest', 41 | 'Referer': 'http://space.bilibili.com/45388', 42 | 'Origin': 'http://space.bilibili.com', 43 | 'Host': 'space.bilibili.com', 44 | 'AlexaToolbar-ALX_NS_PH': 'AlexaToolbar/alx-4.0', 45 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4', 46 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 47 | } 48 | proxies = { 49 | 'http': 'http://61.155.164.108:3128', 50 | 'http': 'http://116.199.115.79:80', 51 | 'http': 'http://42.245.252.35:80', 52 | 'http': 'http://106.14.51.145:8118', 53 | 'http': 'http://116.199.115.78:80', 54 | 'http': 'http://123.147.165.143:8080', 55 | 'http': 'http://58.62.86.216:9999', 56 | 'http': 'http://202.201.3.121:3128', 57 | 'http': 'http://119.29.201.134:808', 58 | 'http': 'http://61.155.164.112:3128', 59 | 'http': 'http://123.57.76.102:80', 60 | 'http': 'http://116.199.115.78:80', 61 | } 62 | time1 = time.time() 63 
| 64 | for m in range(99, 101): # 26 ,1000 65 | urls = [] 66 | for i in range(m * 100, (m + 1) * 100): 67 | url = 'https://space.bilibili.com/' + str(i) 68 | urls.append(url) 69 | 70 | 71 | def getsource(url): 72 | payload = { 73 | '_': datetime_to_timestamp_in_milliseconds(datetime.datetime.now()), 74 | 'mid': url.replace('https://space.bilibili.com/', '') 75 | } 76 | ua = random.choice(uas) 77 | head = { 78 | 'User-Agent': ua, 79 | 'Referer': 'https://space.bilibili.com/' + str(i) + '?from=search&seid=' + str(random.randint(10000, 50000)) 80 | } 81 | jscontent = requests \ 82 | .session() \ 83 | .post('http://space.bilibili.com/ajax/member/GetInfo', 84 | headers=head, 85 | data=payload, 86 | proxies=proxies) \ 87 | .text 88 | time2 = time.time() 89 | try: 90 | jsDict = json.loads(jscontent) 91 | statusJson = jsDict['status'] if 'status' in jsDict.keys() else False 92 | if statusJson == True: 93 | if 'data' in jsDict.keys(): 94 | jsData = jsDict['data'] 95 | mid = jsData['mid'] 96 | name = jsData['name'] 97 | sex = jsData['sex'] 98 | face = jsData['face'] 99 | coins = jsData['coins'] 100 | spacesta = jsData['spacesta'] 101 | birthday = jsData['birthday'] if 'birthday' in jsData.keys() else 'nobirthday' 102 | place = jsData['place'] if 'place' in jsData.keys() else 'noplace' 103 | description = jsData['description'] 104 | article = jsData['article'] 105 | playnum = jsData['playNum'] 106 | sign = jsData['sign'] 107 | level = jsData['level_info']['current_level'] 108 | exp = jsData['level_info']['current_exp'] 109 | print("Succeed: " + mid + "\t" + str(time2 - time1)) 110 | try: 111 | res = requests.get( 112 | 'https://api.bilibili.com/x/space/navnum?mid=' + str(mid) + '&jsonp=jsonp').text 113 | js_fans_data = json.loads(res) 114 | following = js_fans_data['data']['following'] 115 | fans = js_fans_data['data']['follower'] 116 | except: 117 | following = 0 118 | fans = 0 119 | else: 120 | print('no data now') 121 | try: 122 | conn = pymysql.connect( 123 | host='127.0.0.1', port=3306, user='root', passwd='******', db='sunshine',charset="utf8") 124 | cur = conn.cursor() 125 | cur.execute('INSERT INTO bilibili_user_info(mid, name, sex, face, coins, spacesta, \ 126 | birthday, place, description, article, following, fans, playnum, sign, level, exp) \ 127 | VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")' 128 | % ( 129 | mid, name, sex, face, coins, spacesta, 130 | birthday, place, description, article, 131 | following, fans, playnum, sign, level, exp 132 | )) 133 | conn.commit() 134 | except Exception: 135 | print("MySQL Error") 136 | else: 137 | print("Error: " + url) 138 | except ValueError: 139 | pass 140 | 141 | 142 | pool = ThreadPool(1) 143 | try: 144 | results = pool.map(getsource, urls) 145 | except Exception: 146 | print('ConnectionError') 147 | pool.close() 148 | pool.join() 149 | time.sleep(11) 150 | pool = ThreadPool(1) 151 | results = pool.map(getsource, urls) 152 | 153 | time.sleep(30) 154 | 155 | pool.close() 156 | pool.join() 157 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/22_python爬取Bilibili用户信息并导入mysql数据库/bilibili_user_info.sql: -------------------------------------------------------------------------------- 1 | # ************************************************************ 2 | # Sequel Pro SQL dump 3 | # Version 4135 4 | # 5 | # http://www.sequelpro.com/ 6 | # http://code.google.com/p/sequel-pro/ 7 | # 8 | # Host: 127.0.0.1 (MySQL 5.1.63) 9 | # Database: sunshine 10 | # 
Generation Time: 2018-04-26 13:33:32 +0000 11 | # ************************************************************ 12 | 13 | 14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 17 | /*!40101 SET NAMES utf8 */; 18 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 19 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 20 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 21 | 22 | 23 | # Dump of table bilibili_user_info 24 | # ------------------------------------------------------------ 25 | 26 | CREATE TABLE `bilibili_user_info` ( 27 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 28 | `mid` varchar(11) DEFAULT NULL, 29 | `name` varchar(45) DEFAULT NULL, 30 | `sex` varchar(11) DEFAULT NULL, 31 | `face` varchar(200) DEFAULT NULL, 32 | `coins` int(11) DEFAULT NULL, 33 | `spacesta` int(11) DEFAULT NULL, 34 | `birthday` varchar(45) DEFAULT NULL, 35 | `place` varchar(45) DEFAULT NULL, 36 | `description` varchar(45) DEFAULT NULL, 37 | `article` int(11) DEFAULT NULL, 38 | `following` int(11) DEFAULT NULL, 39 | `fans` int(11) DEFAULT NULL, 40 | `playnum` int(30) DEFAULT NULL, 41 | `sign` varchar(300) DEFAULT NULL, 42 | `level` int(11) DEFAULT NULL, 43 | `exp` int(11) DEFAULT NULL, 44 | PRIMARY KEY (`id`) 45 | ) ENGINE=MyISAM DEFAULT CHARSET=utf8; 46 | 47 | 48 | 49 | 50 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 51 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 52 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 53 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 54 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 55 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 56 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/README.md: -------------------------------------------------------------------------------- 1 | #### 这是一个爬取网易云音乐的所有的歌曲的评论数的爬虫。 2 | 3 | 以下为主要思路: 4 | 5 | - 1. 爬取所有的歌手信息([artists.py]); 6 | - 2. 根据上一步爬取到的歌手信息去爬取所有的专辑信息([album_by_artist.py]); 7 | - 3. 根据专辑信息爬取所有的歌曲信息([music_by_album.py]); 8 | - 4. 根据歌曲信息爬取其评论条数([comments_by_music.py]) 9 | - 5. 
数据库相关的语句都存放于([sql.py])中。 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/album_by_artist.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据上一步获取的歌手的 ID 来用于获取所有的专辑 ID 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import time 7 | from music_163 import sql 8 | 9 | 10 | class Album(object): 11 | headers = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 15 | 'Cache-Control': 'no-cache', 16 | 'Connection': 'keep-alive', 17 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; _ga=GA1.2.1405085820.1476521280; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; JSESSIONID-WYYY=189f31767098c3bd9d03d9b968c065daf43cbd4c1596732e4dcb471beafe2bf0605b85e969f92600064a977e0b64a24f0af7894ca898b696bd58ad5f39c8fce821ec2f81f826ea967215de4d10469e9bd672e75d25f116a9d309d360582a79620b250625859bc039161c78ab125a1e9bf5d291f6d4e4da30574ccd6bbab70b710e3f358f%3A1476594130342; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476588849.1476592408.6; __utmb=94650624.11.10.1476592408; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 18 | 'DNT': '1', 19 | 'Host': 'music.163.com', 20 | 'Pragma': 'no-cache', 21 | 'Referer': 'http://music.163.com/', 22 | 'Upgrade-Insecure-Requests': '1', 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 24 | } 25 | 26 | def save_albums(self, artist_id): 27 | params = {'id': artist_id, 'limit': '200'} 28 | # 获取歌手个人主页 29 | r = requests.get('http://music.163.com/artist/album', headers=self.headers, params=params) 30 | 31 | # 网页解析 32 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 33 | body = soup.body 34 | 35 | albums = body.find_all('a', attrs={'class': 'tit f-thide s-fc0'}) # 获取所有专辑 36 | 37 | for album in albums: 38 | albume_id = album['href'].replace('/album?id=', '') 39 | sql.insert_album(albume_id, artist_id) 40 | 41 | 42 | if __name__ == '__main__': 43 | artists = sql.get_all_artist() 44 | my_album = Album() 45 | for i in artists: 46 | try: 47 | my_album.save_albums(i['ARTIST_ID']) 48 | # print(i) 49 | except Exception as e: 50 | # 打印错误日志 51 | print(str(i) + ': ' + str(e)) 52 | time.sleep(5) 53 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/artists.py: -------------------------------------------------------------------------------- 1 | """ 2 | 获取所有的歌手信息 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from music_163 import sql 7 | 8 | headers = { 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 10 | 'Accept-Encoding': 'gzip, deflate, sdch', 11 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 12 | 'Cache-Control': 'no-cache', 13 | 'Connection': 'keep-alive', 14 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; 
_ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; NTES_SESS=Fa2uk.YZsGoj59AgD6tRjTXGaJ8_1_4YvGfXUkS7C1NwtMe.tG1Vzr255TXM6yj2mKqTZzqFtoEKQrgewi9ZK60ylIqq5puaG6QIaNQ7EK5MTcRgHLOhqttDHfaI_vsBzB4bibfamzx1.fhlpqZh_FcnXUYQFw5F5KIBUmGJg7xdasvGf_EgfICWV; S_INFO=1476597594|1|0&80##|hourui93; NETEASE_AUTH_SOURCE=space; NETEASE_AUTH_USERNAME=hourui93; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=cbd082d2ce2cffbcd5c085d8bf565a95aee3173ddbbb00bfa270950f93f1d8bb4cb55a56a4049fa8c828373f630c78f4a43d6c3d252c4c44f44b098a9434a7d8fc110670a6e1e9af992c78092936b1e19351435ecff76a181993780035547fa5241a5afb96e8c665182d0d5b911663281967d675ff2658015887a94b3ee1575fa1956a5a%3A1476607977016; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476595468.1476606177.8; __utmb=94650624.20.10.1476606177; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 15 | 'DNT': '1', 16 | 'Host': 'music.163.com', 17 | 'Pragma': 'no-cache', 18 | 'Referer': 'http://music.163.com/', 19 | 'Upgrade-Insecure-Requests': '1', 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 21 | } 22 | 23 | 24 | def save_artist(group_id, initial): 25 | params = {'id': group_id, 'initial': initial} 26 | r = requests.get('http://music.163.com/discover/artist/cat', params=params) 27 | 28 | # 网页解析 29 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 30 | body = soup.body 31 | 32 | hot_artists = body.find_all('a', attrs={'class': 'msk'}) 33 | artists = body.find_all('a', attrs={'class': 'nm nm-icn f-thide s-fc0'}) 34 | 35 | for artist in hot_artists: 36 | artist_id = artist['href'].replace('/artist?id=', '').strip() 37 | artist_name = artist['title'].replace('的音乐', '') 38 | try: 39 | sql.insert_artist(artist_id, artist_name) 40 | except Exception as e: 41 | # 打印错误日志 42 | print(e) 43 | 44 | for artist in artists: 45 | artist_id = artist['href'].replace('/artist?id=', '').strip() 46 | artist_name = artist['title'].replace('的音乐', '') 47 | try: 48 | sql.insert_artist(artist_id, artist_name) 49 | except Exception as e: 50 | # 打印错误日志 51 | print(e) 52 | 53 | 54 | gg = 4003 55 | 56 | save_artist(gg, 0) 57 | for i in range(65, 91): 58 | save_artist(gg, i) 59 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/comments_by_music.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据歌曲 ID 获得所有的歌曲所对应的评论信息 3 | """ 4 | 5 | import requests 6 | from music_163 import sql 7 | import time 8 | import threading 9 | import pymysql.cursors 10 | 11 | 12 | class Comments(object): 13 | headers = { 14 | 'Host': 'music.163.com', 15 | 'Connection': 'keep-alive', 16 | 'Content-Length': '484', 17 | 'Cache-Control': 'max-age=0', 18 | 'Origin': 'http://music.163.com', 19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36', 20 | 'Content-Type': 'application/x-www-form-urlencoded', 21 | 'Accept': '*/*', 22 | 'DNT': '1', 23 | 'Accept-Encoding': 'gzip, deflate', 24 | 'Accept-Language': 
'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4', 25 | 'Cookie': 'JSESSIONID-WYYY=b66d89ed74ae9e94ead89b16e475556e763dd34f95e6ca357d06830a210abc7b685e82318b9d1d5b52ac4f4b9a55024c7a34024fddaee852404ed410933db994dcc0e398f61e670bfeea81105cbe098294e39ac566e1d5aa7232df741870ba1fe96e5cede8372ca587275d35c1a5d1b23a11e274a4c249afba03e20fa2dafb7a16eebdf6%3A1476373826753; _iuqxldmzr_=25; _ntes_nnid=7fa73e96706f26f3ada99abba6c4a6b2,1476372027128; _ntes_nuid=7fa73e96706f26f3ada99abba6c4a6b2; __utma=94650624.748605760.1476372027.1476372027.1476372027.1; __utmb=94650624.4.10.1476372027; __utmc=94650624; __utmz=94650624.1476372027.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 26 | } 27 | 28 | params = { 29 | 'csrf_token': '' 30 | } 31 | 32 | data = { 33 | 'params': 'Ak2s0LoP1GRJYqE3XxJUZVYK9uPEXSTttmAS+8uVLnYRoUt/Xgqdrt/13nr6OYhi75QSTlQ9FcZaWElIwE+oz9qXAu87t2DHj6Auu+2yBJDr+arG+irBbjIvKJGfjgBac+kSm2ePwf4rfuHSKVgQu1cYMdqFVnB+ojBsWopHcexbvLylDIMPulPljAWK6MR8', 34 | 'encSecKey': '8c85d1b6f53bfebaf5258d171f3526c06980cbcaf490d759eac82145ee27198297c152dd95e7ea0f08cfb7281588cdab305946e01b9d84f0b49700f9c2eb6eeced8624b16ce378bccd24341b1b5ad3d84ebd707dbbd18a4f01c2a007cd47de32f28ca395c9715afa134ed9ee321caa7f28ec82b94307d75144f6b5b134a9ce1a' 35 | } 36 | 37 | proxies = {'http': 'http://127.0.0.1:10800'} 38 | 39 | def get_comments(self, music_id, flag): 40 | self.headers['Referer'] = 'http://music.163.com/playlist?id=' + str(music_id) 41 | if flag: 42 | r = requests.post('http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + str(music_id), 43 | headers=self.headers, params=self.params, data=self.data, proxies=self.proxies) 44 | else: 45 | r = requests.post('http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + str(music_id), 46 | headers=self.headers, params=self.params, data=self.data) 47 | return r.json() 48 | 49 | 50 | if __name__ == '__main__': 51 | my_comment = Comments() 52 | 53 | 54 | def save_comments(musics, flag, connection0): 55 | for i in musics: 56 | my_music_id = i['MUSIC_ID'] 57 | try: 58 | comments = my_comment.get_comments(my_music_id, flag) 59 | if comments['total'] > 0: 60 | sql.insert_comments(my_music_id, comments['total'], str(comments), connection0) 61 | except Exception as e: 62 | # 打印错误日志 63 | print(my_music_id) 64 | print(e) 65 | time.sleep(5) 66 | 67 | 68 | music_before = sql.get_before_music() 69 | music_after = sql.get_after_music() 70 | 71 | # pymysql 链接不是线程安全的 72 | connection1 = pymysql.connect(host='localhost', 73 | user='root', 74 | password='1234', 75 | db='test', 76 | charset='utf8mb4', 77 | cursorclass=pymysql.cursors.DictCursor) 78 | 79 | connection2 = pymysql.connect(host='localhost', 80 | user='root', 81 | password='1234', 82 | db='test', 83 | charset='utf8mb4', 84 | cursorclass=pymysql.cursors.DictCursor) 85 | 86 | t1 = threading.Thread(target=save_comments, args=(music_before, True, connection1)) 87 | t2 = threading.Thread(target=save_comments, args=(music_after, False, connection2)) 88 | t1.start() 89 | t2.start() 90 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/music_by_album.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据专辑 ID 获取到所有的音乐 ID 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import time 7 | from music_163 import sql 8 | 9 | 10 | class Music(object): 11 | headers = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 15 | 'Cache-Control': 'no-cache', 16 | 'Connection': 'keep-alive', 17 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=fb5288e1c5f667324f1636d020704cab2f27ee915622b114f89027cbf60c38be2af6b9cbef2223c1f2581e3502f11b86efd60891d6f61b6f783c0d55114f8269fa801df7352f5cc4c8259876e563a6bd0212b504a8997723a0593b21d5b3d9076d4fa38c098be68e3c5d36d342e4a8e40c1f73378cec0b5851bd8a628886edbdd23a7093%3A1476623819662; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476610320.1476622020.10; __utmb=94650624.14.10.1476622020; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 18 | 'DNT': '1', 19 | 'Host': 'music.163.com', 20 | 'Pragma': 'no-cache', 21 | 'Referer': 'http://music.163.com/', 22 | 'Upgrade-Insecure-Requests': '1', 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 24 | } 25 | 26 | def save_music(self, album_id): 27 | params = {'id': album_id} 28 | # 获取专辑对应的页面 29 | r = requests.get('http://music.163.com/album', headers=self.headers, params=params) 30 | 31 | # 网页解析 32 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 33 | body = soup.body 34 | 35 | musics = body.find('ul', attrs={'class': 'f-hide'}).find_all('li') # 获取专辑的所有音乐 36 | 37 | for music in musics: 38 | music = music.find('a') 39 | music_id = music['href'].replace('/song?id=', '') 40 | music_name = music.getText() 41 | sql.insert_music(music_id, music_name, album_id) 42 | 43 | 44 | if __name__ == '__main__': 45 | albums = sql.get_all_album() 46 | my_music = Music() 47 | for i in albums: 48 | try: 49 | my_music.save_music(i['ALBUM_ID']) 50 | # print(i) 51 | except Exception as e: 52 | # 打印错误日志 53 | print(str(i) + ': ' + str(e)) 54 | time.sleep(5) 55 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/sql.py: -------------------------------------------------------------------------------- 1 | """ 2 | 一般 Python 用于连接 MySQL 的工具:pymysql 3 | """ 4 | import pymysql.cursors 5 | 6 | connection = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='***', db='sunshine',charset="utf8") 7 | 8 | 9 | # 保存评论 10 | def insert_comments(music_id, comments, detail, connection): 11 | with connection.cursor() as cursor: 12 | sql = "INSERT INTO `comments` (`MUSIC_ID`, `COMMENTS`, `DETAILS`) VALUES (%s, %s, %s)" 13 | cursor.execute(sql, (music_id, comments, detail)) 14 | connection.commit() 15 | 16 | 17 | # 保存音乐 18 | def insert_music(music_id, music_name, album_id): 19 | with connection.cursor() as cursor: 20 | sql = "INSERT INTO `musics` (`MUSIC_ID`, `MUSIC_NAME`, `ALBUM_ID`) VALUES (%s, %s, %s)" 21 | cursor.execute(sql, (music_id, music_name, album_id)) 22 | connection.commit() 23 | 24 | 25 | # 保存专辑 26 | def insert_album(album_id, artist_id): 27 | with connection.cursor() as cursor: 28 | sql = "INSERT INTO `albums` (`ALBUM_ID`, `ARTIST_ID`) VALUES (%s, %s)" 29 | cursor.execute(sql, (album_id, artist_id)) 30 | connection.commit() 
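# Note: the helpers in this module share the single module-level `connection`
# created above and commit after each statement. As the comment in
# comments_by_music.py points out, a pymysql connection is not thread-safe,
# which is why insert_comments() accepts a connection argument from its caller
# instead of relying on this shared one.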
31 | 32 | 33 | # 保存歌手 34 | def insert_artist(artist_id, artist_name): 35 | with connection.cursor() as cursor: 36 | sql = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME`) VALUES (%s, %s)" 37 | cursor.execute(sql, (artist_id, artist_name)) 38 | connection.commit() 39 | 40 | 41 | # 获取所有歌手的 ID 42 | def get_all_artist(): 43 | with connection.cursor() as cursor: 44 | sql = "SELECT `ARTIST_ID` FROM `artists` ORDER BY ARTIST_ID" 45 | cursor.execute(sql, ()) 46 | return cursor.fetchall() 47 | 48 | 49 | # 获取所有专辑的 ID 50 | def get_all_album(): 51 | with connection.cursor() as cursor: 52 | sql = "SELECT `ALBUM_ID` FROM `albums` ORDER BY ALBUM_ID" 53 | cursor.execute(sql, ()) 54 | return cursor.fetchall() 55 | 56 | 57 | # 获取所有音乐的 ID 58 | def get_all_music(): 59 | with connection.cursor() as cursor: 60 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID" 61 | cursor.execute(sql, ()) 62 | return cursor.fetchall() 63 | 64 | 65 | # 获取前一半音乐的 ID 66 | def get_before_music(): 67 | with connection.cursor() as cursor: 68 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 0, 800000" 69 | cursor.execute(sql, ()) 70 | return cursor.fetchall() 71 | 72 | 73 | # 获取后一半音乐的 ID 74 | def get_after_music(): 75 | with connection.cursor() as cursor: 76 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 800000, 1197429" 77 | cursor.execute(sql, ()) 78 | return cursor.fetchall() 79 | 80 | 81 | def dis_connect(): 82 | connection.close() 83 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/ctrip_items.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/ctrip_items.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- 
/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class FindtripItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | site = scrapy.Field() 16 | company = scrapy.Field() 17 | flight_time = scrapy.Field() 18 | airports = scrapy.Field() 19 | passtime = scrapy.Field() 20 | price = scrapy.Field() 21 | 22 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class FindtripSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from findtrip.spiders.washctrip import wash 8 | import pymongo 9 | from scrapy.conf import settings 10 | from scrapy import log 11 | from scrapy.exceptions import DropItem 12 | 13 | class FindtripPipeline(object): 14 | def process_item(self, item, spider): 15 | return item 16 | 17 | 18 | class MongoDBPipeline(object): 19 | def __init__(self): 20 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 21 | self.db = self.client[settings['MONGO_DB']] 22 | self.post = self.db[settings['MONGO_COLL']] 23 | 24 | def process_item(self, item, spider): 25 | if item['site'] == 'Qua': 26 | if item['company']: 27 | item['company'] = wash(item['company']) 28 | if item['flight_time']: 29 | item['flight_time'] = wash(item['flight_time']) 30 | if item['airports']: 31 | item['airports'] = wash(item['airports']) 32 | if item['passtime']: 33 | item['passtime'] = wash(item['passtime']) 34 | if item['price']: 35 | item['price'] = wash(item['price']) 36 | for data in item: 37 | if not item[data]: 38 | raise DropItem("Missing data!") 39 | self.post.insert(dict(item)) 40 | log.msg("Question added to MongoDB database!", 41 | level=log.DEBUG, spider=spider) 42 | elif item['site'] == 'Ctrip': 43 | self.post.insert(dict(item)) 44 | log.msg("Question added to MongoDB database!", 45 | level=log.DEBUG, spider=spider) 46 | 47 | return item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for findtrip project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'findtrip' 13 | 14 | SPIDER_MODULES = ['findtrip.spiders'] 15 | NEWSPIDER_MODULE = 'findtrip.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'findtrip (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # 配置mongoDB 27 | MONGO_HOST = "127.0.0.1" # 主机IP 28 | MONGO_PORT = 27017 # 端口号 29 | MONGO_DB = "FindTrip" # 库名 30 | MONGO_COLL = "qua_findtrip" # collection 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | #DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | #} 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'findtrip.middlewares.FindtripSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'findtrip.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | # 'findtrip.pipelines.FindtripPipeline': 300, 77 | 'findtrip.pipelines.MongoDBPipeline': 300, 78 | } 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS 
= 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/ctrip_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/ctrip_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/qua_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/qua_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/washctrip.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/washctrip.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/ctrip_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from findtrip.items import FindtripItem 3 | 4 | class CtripSpider(scrapy.Spider): 5 | name = 'ctrip' 6 | start_urls = [ 7 | "http://flights.ctrip.com/booking/XMN-BJS-day-1.html?DDate1=2016-04-19" 8 | ] 9 | 10 | def parse(self, response): 11 | sel = scrapy.Selector(response) 12 | fligint_div = "//div[@id='J_flightlist2']/div" 13 | dataList = sel.xpath(fligint_div) 14 | 15 | for index,each in enumerate(dataList): 16 | flight_each = fligint_div+'['+str(index+1)+']' 17 | flight_tr = flight_each+"//tr[@class='J_header_row']" 
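            # The loop above re-selects each flight by building an absolute,
            # index-based XPath for the current row (flight_each) and for its
            # header row (flight_tr); the 'train_flight_tit' check right below
            # is then used to skip train tickets that Ctrip mixes into the
            # flight list.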
18 | istrain = sel.xpath(flight_each + "//div[@class='train_flight_tit']") 19 | 20 | if istrain: 21 | print ("this data is train add") 22 | else: 23 | company = sel.xpath(flight_tr + "//div[@class='info-flight J_flight_no']//text()").extract() 24 | 25 | flight_time_from = sel.xpath(flight_tr + "//td[@class='right']/div[1]//text()").extract() 26 | flight_time_to = sel.xpath(flight_tr + "//td[@class='left']/div[1]//text()").extract() 27 | flight_time = [flight_time_from,flight_time_to] 28 | 29 | airports_from = sel.xpath(flight_tr + "//td[@class='right']/div[2]//text()").extract() 30 | airports_to = sel.xpath(flight_tr + "//td[@class='left']/div[2]//text()").extract() 31 | airports = [airports_from,airports_to] 32 | 33 | price_middle = sel.xpath(flight_tr + "[1]//td[@class='price middle ']/span//text()").extract() 34 | price = sel.xpath(flight_tr + "[1]//td[@class='price ']/span//text()").extract() 35 | if price_middle: 36 | price = price_middle 37 | elif price: 38 | price = price 39 | else: 40 | price = '' 41 | 42 | item = FindtripItem() 43 | item['site'] = 'Ctrip' 44 | item['company'] = company 45 | item['flight_time'] = flight_time 46 | item['airports'] = airports 47 | item['price'] = price 48 | yield item 49 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/qua_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from findtrip.items import FindtripItem 3 | 4 | class QuaSpider(scrapy.Spider): 5 | name = "qua" 6 | start_urls = [ 7 | "http://www.qua.com/flights/PEK-XMN/2016-05-12?m=CNY&from=flight_home" 8 | ] 9 | 10 | def parse(self, response): 11 | sel = scrapy.Selector(response) 12 | dataList = sel.xpath("//div[@class='m-fly-item s-oneway']") 13 | 14 | for index,each in enumerate(dataList): 15 | flight_each = "//div[@id='list-box']/div["+str(index+1)+"]" 16 | detail_span = "//div[@class='fl-detail-nav']/ul/li[1]/span[@class='nav-label']" 17 | f_route_div = "//div[@class='m-fl-info-bd']/div" 18 | 19 | airports = sel.xpath(flight_each + f_route_div + '/p[3]//text()').extract() 20 | company = sel.xpath(flight_each + f_route_div + '/p[1]//text()').extract() 21 | flight_time = sel.xpath(flight_each + f_route_div + '/p[2]//text()').extract() 22 | passtime = sel.xpath(flight_each + f_route_div + '/p[4]//text()').extract() 23 | price = sel.xpath(flight_each + "//div[@class='fl-price-box']//em//text()").extract() 24 | 25 | item = FindtripItem() 26 | item['site'] = 'Qua' 27 | item['company'] = company 28 | item['flight_time'] = flight_time 29 | item['airports'] = airports 30 | item['passtime'] = passtime 31 | item['price'] = price 32 | yield item 33 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/washctrip.py: -------------------------------------------------------------------------------- 1 | def wash(dateList): 2 | dateList = map(lambda x : x.split(), dateList) 3 | cleanList = [] 4 | for each in dateList: 5 | if each: 6 | cleanList.append(each[0]) 7 | return cleanList 8 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.csv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.json -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = findtrip.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = findtrip 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Field, Item 9 | 10 | 11 | class PythonjobsItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #pass 15 | title = Field() 16 | city = Field() 17 | company = Field() 18 | location = Field() 19 | url = Field() 20 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class PythonjobsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class PythonjobsPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for pythonjobs project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'pythonjobs' 13 | 14 | SPIDER_MODULES = ['pythonjobs.spiders'] 15 | NEWSPIDER_MODULE = 'pythonjobs.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'pythonjobs (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'pythonjobs.middlewares.PythonjobsSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'pythonjobs.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | 
ITEM_PIPELINES = { 69 | 'pythonjobs.pipelines.PythonjobsPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/job_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/job_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/job_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from pythonjobs.items import PythonjobsItem 4 | #from bs4 import BeautifulSoup 5 | 6 | class JobspiderSpider(scrapy.Spider): 7 | name = 'jobSpider' 8 | allowed_domains = ['search.51job.com','jobs.51job.com'] 9 | 10 | def start_requests(self): 11 | for i in range(1,20): # Set pages to crawl here. 
12 | url = "http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{0}.html".format(i) 13 | yield scrapy.Request(url) 14 | 15 | def parse(self, response): 16 | for sel in response.css("html body div.dw_wp div#resultList.dw_table div.el p.t1 span a"): 17 | url = sel.re('href="(.*?)"')[0] 18 | yield scrapy.Request(url,callback=self.parse_item) 19 | 20 | def parse_item(self, response): 21 | item = PythonjobsItem() 22 | item['title'] = response.xpath('//div[@class="cn"]/h1/@title').extract()[0] 23 | item['url'] = response.url 24 | item['city'] = response.xpath('//span[@class="lname"]/text()').extract()[0] 25 | item['company'] = response.xpath('//p[@class="cname"]/a/@title').extract()[0] 26 | item['location'] = response.xpath('//p[@class="fp"]/text()').extract()[1].rstrip() 27 | return item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = pythonjobs.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = pythonjobs 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/ghostdriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/ghostdriver.log -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = shuimujob.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = shuimujob 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- 
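The PythonjobsPipeline of project 25 is a plain pass-through, so the PythonJobs.csv listed in the tree is presumably produced with Scrapy's feed export (e.g. scrapy crawl jobSpider -o PythonJobs.csv) rather than by a pipeline. For reference only, a pipeline-based CSV export could look like the sketch below; the class name, output file name and field order are illustrative assumptions, not part of the repository:

# Hypothetical alternative to the feed export; not part of the original project.
from scrapy.exporters import CsvItemExporter

class CsvExportPipeline(object):
    def open_spider(self, spider):
        # CsvItemExporter writes bytes, so the file is opened in binary mode.
        self.file = open('PythonJobs.csv', 'wb')
        self.exporter = CsvItemExporter(
            self.file,
            fields_to_export=['title', 'company', 'city', 'location', 'url'])
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

Enabling it would only require pointing ITEM_PIPELINES at 'pythonjobs.pipelines.CsvExportPipeline' in settings.py.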
/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/platform.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/platform.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ShuimujobItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | title = scrapy.Field() 16 | href = scrapy.Field() 17 | author = scrapy.Field() 18 | time = scrapy.Field() 19 | content = scrapy.Field() 20 | is_dev = scrapy.Field() 21 | is_alg = scrapy.Field() 22 | is_fin = scrapy.Field() 23 | base_url_index = scrapy.Field() 24 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ShuimujobSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from scrapy.conf import settings 9 | from scrapy.exceptions import DropItem 10 | from scrapy import log 11 | 12 | class ShuimujobPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | class MongoDBPipeline(object): 17 | 18 | def __init__(self): 19 | pass 20 | 21 | 22 | def open_spider(self, spider): 23 | self.client = pymongo.MongoClient( 24 | settings['MONGODB_SERVER'], 25 | settings['MONGODB_PORT'] 26 | ) 27 | self.db = self.client[settings['MONGODB_DB']] 28 | self.collection = self.db[settings['MONGODB_COLLECTION']] 29 | 30 | def close_spider(self, spider): 31 | self.client.close() 32 | 33 | def process_item(self, item, spider): 34 | valid = True 35 | for data in item: 36 | if not data : 37 | valid = False 38 | raise DropItem("Missing {0}!".format(data)) 39 | if item['title'] == '': 40 | valid = False 41 | raise DropItem("title is '' ") 42 | if item['content'] == '': 43 | valid = False 44 | raise DropItem("content is '' ") 45 | if valid: 46 | self.collection.insert(dict(item)) 47 | return item 48 | 49 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/platform.py: -------------------------------------------------------------------------------- 1 | import sys 2 | def getPlatform(): 3 | platform='' 4 | if sys.platform.startswith('win'): 5 | platform = 'win' 6 | elif sys.platform.startswith('linux'): 7 | platform = 'linux' 8 | return platform -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for shuimujob project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'shuimujob' 13 | 14 | SPIDER_MODULES = ['shuimujob.spiders'] 15 | NEWSPIDER_MODULE = 'shuimujob.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'shuimujob (+http://www.yourdomain.com)' 20 | 21 | 22 | MONGODB_SERVER = "localhost" 23 | MONGODB_PORT = 27017 24 | MONGODB_DB = "shuimujob" 25 | MONGODB_COLLECTION = "job_info" 26 | 27 | # Obey robots.txt rules 28 | ROBOTSTXT_OBEY = False 29 | 30 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 31 | #CONCURRENT_REQUESTS = 32 32 | 33 | # Configure a delay for requests for the same website (default: 0) 34 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 35 | # See also autothrottle settings and docs 36 | #DOWNLOAD_DELAY = 3 37 | # The download delay setting will honor only one of: 38 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 39 | #CONCURRENT_REQUESTS_PER_IP = 16 40 | 41 | # Disable cookies (enabled by default) 42 | COOKIES_ENABLED = False 43 | 44 | # Disable Telnet Console (enabled by default) 45 | #TELNETCONSOLE_ENABLED = False 46 | 47 | # Override the default request headers: 48 | #DEFAULT_REQUEST_HEADERS = { 49 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 50 | # 'Accept-Language': 'en', 51 | #} 52 | 53 | # Enable or disable spider middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 55 | #SPIDER_MIDDLEWARES = { 56 | # 'shuimujob.middlewares.ShuimujobSpiderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable downloader middlewares 60 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 61 | #DOWNLOADER_MIDDLEWARES = { 62 | # 'shuimujob.middlewares.MyCustomDownloaderMiddleware': 543, 63 | #} 64 | 65 | # Enable or disable extensions 66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | # 'shuimujob.pipelines.ShuimujobPipeline': 300, 75 | 'shuimujob.pipelines.MongoDBPipeline':300 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 
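The MongoDBPipeline in shuimujob/pipelines.py reads the MONGODB_* values above through `from scrapy.conf import settings`, an import that has been deprecated for years and is removed in newer Scrapy releases. A minimal sketch of the same wiring done through from_crawler(), assuming only the setting names defined above (the field-validation logic of the original process_item is omitted for brevity):

# Sketch only: equivalent MongoDB wiring without scrapy.conf.
import pymongo

class MongoDBPipeline(object):
    def __init__(self, server, port, db, collection):
        self.server = server
        self.port = port
        self.db_name = db
        self.coll_name = collection

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(s.get('MONGODB_SERVER'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DB'), s.get('MONGODB_COLLECTION'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.server, self.port)
        self.collection = self.client[self.db_name][self.coll_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))  # insert() is deprecated in pymongo 3.x
        return item

Functionally this follows the original pipeline; only the way settings are obtained (crawler.settings) and the pymongo call differ.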
98 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/shuimu_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/shuimu_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/shuimu_spider.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import scrapy 3 | from shuimujob.items import ShuimujobItem 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from bs4 import BeautifulSoup 9 | from scrapy import signals 10 | from scrapy.xlib.pydispatch import dispatcher 11 | from shuimujob.platform import getPlatform 12 | 13 | class SMSpider(scrapy.spiders.CrawlSpider): 14 | ''' 15 | #要建立一个 Spider,你可以为 scrapy.spider.BaseSpider 创建一个子类,并确定三个主要的、强制的属性: 16 | #name :爬虫的识别名,它必须是唯一的,在不同的爬虫中你必须定义不同的名字. 
17 | #start_urls :爬虫开始爬的一个 URL 列表。爬虫从这里开始抓取数据,所以,第一次下载的数据将会从这些 URLS 开始。其他子 URL 将会从这些起始 URL 中继承性生成。 18 | #parse() :爬虫的方法,调用时候传入从每一个 URL 传回的 Response 对象作为参数,response 将会是 parse 方法的唯一的一个参数, 19 | #这个方法负责解析返回的数据、匹配抓取的数据(解析为 item )并跟踪更多的 URL。 20 | ''' 21 | name="shuimujob" 22 | base_url = 'http://www.newsmth.net/nForum/board/Intern' 23 | start_urls = [base_url] 24 | start_urls.extend([base_url+'?p='+str(i) for i in range(2,4)]) 25 | # start_urls = ['http://www.newsmth.net/'] 26 | platform = getPlatform() 27 | 28 | def __init__(self): 29 | scrapy.spiders.Spider.__init__(self) 30 | if self.platform == 'linux': 31 | self.driver = webdriver.PhantomJS() 32 | elif self.platform == 'win': 33 | self.driver = webdriver.PhantomJS() 34 | self.driver.set_page_load_timeout(15) 35 | dispatcher.connect(self.spider_closed, signals.spider_closed) 36 | 37 | 38 | 39 | def spider_closed(self, spider): 40 | self.driver.quit() 41 | 42 | def parse(self,response): 43 | self.driver.get(response.url) 44 | 45 | element = WebDriverWait(self.driver,30).until(EC.presence_of_all_elements_located((By.TAG_NAME,'table'))) 46 | page_source = self.driver.page_source 47 | bs_obj = BeautifulSoup(page_source, "lxml") 48 | table = bs_obj.find('table',class_='board-list tiz') 49 | intern_messages = table.find_all('tr',class_=False) 50 | for message in intern_messages: 51 | title, href, time, author = '','','','' 52 | td_9 = message.find('td',class_='title_9') 53 | if td_9: 54 | title = td_9.a.get_text().encode('utf-8','ignore') 55 | href = td_9.a['href'] 56 | td_10 = message.find('td', class_='title_10') 57 | if td_10: 58 | time=td_10.get_text().encode('utf-8','ignore') 59 | td_12 = message.find('td', class_='title_12') 60 | if td_12: 61 | author = td_12.a.get_text().encode('utf-8','ignore') 62 | item = ShuimujobItem() 63 | item['title'] = title 64 | item['href'] = href 65 | item['time'] = time 66 | item['author'] = author 67 | item['base_url_index'] = 0 68 | root_url = 'http://www.newsmth.net' 69 | # content = scrapy.Request(root_url+href,self.parse_content) 70 | if href!='': 71 | content = self.parse_content(root_url+href) 72 | # print 'content:', content 73 | item['content'] = content 74 | yield item 75 | 76 | def parse_content(self,url): 77 | 78 | self.driver.get(url) 79 | element = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located((By.TAG_NAME, 'table'))) 80 | page_source = self.driver.page_source 81 | bs_obj = BeautifulSoup(page_source, "lxml") 82 | return bs_obj.find('td', class_='a-content').p.get_text().encode('utf-8','ignore') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/house.csv: -------------------------------------------------------------------------------- 1 | house,house_area,house_room,total_price,unit_price 2 | 滨江奥城听雨苑 ,115.83,3室2厅,515.0,44462 3 | 虹苑新寓三村 ,63.8,2室2厅,196.0,30722 4 | 天坛新寓 ,75.16,3室1厅,243.0,32332 5 | 棉鞋营小区 ,69.74,3室1厅,220.0,31546 6 | 常府街10至16号 ,62.21,2室1厅,212.0,34079 7 | house,house_area,house_room,total_price,unit_price 8 | 滨江奥城听雨苑 ,115.83,3室2厅,515.0,44462 9 | 虹苑新寓三村 ,63.8,2室2厅,196.0,30722 10 | 天坛新寓 ,75.16,3室1厅,243.0,32332 11 | 棉鞋营小区 ,69.74,3室1厅,220.0,31546 12 | 常府街10至16号 ,62.21,2室1厅,212.0,34079 13 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class NjHouseItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | house=scrapy.Field() 15 | total_price=scrapy.Field() 16 | unit_price=scrapy.Field() 17 | house_room=scrapy.Field() 18 | house_area=scrapy.Field() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class NjHouseSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class NjHousePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for nj_house project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'nj_house' 13 | 14 | SPIDER_MODULES = ['nj_house.spiders'] 15 | NEWSPIDER_MODULE = 'nj_house.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'nj_house (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'nj_house.middlewares.NjHouseSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'nj_house.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'nj_house.pipelines.NjHousePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__init__.py: 
--------------------------------------------------------------------------------
 1 | # This package will contain the spiders of your Scrapy project
 2 | #
 3 | # Please refer to the documentation for information on how to create and manage
 4 | # your spiders.
 5 |
--------------------------------------------------------------------------------
/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/lj_house.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/lj_house.cpython-36.pyc
--------------------------------------------------------------------------------
/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/lj_house.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import re
 3 | import scrapy
 4 | from nj_house.items import NjHouseItem
 5 |
 6 | class LjHouseSpider(scrapy.Spider):
 7 |     name = "lj_house"
 8 |     allowed_domains = ["nj.lianjia.com"]  # domain only; a path here confuses the offsite filter
 9 |     start_urls = ['http://nj.lianjia.com/ershoufang/']
10 |
11 |     def parse(self, response):
12 |         clears = response.css('.sellListContent li')
13 |         for c in clears:
14 |             item = NjHouseItem()  # build a fresh item for every listing
15 |             house = c.css('.houseInfo a::text').extract_first()
16 |             house_text = c.css('.houseInfo::text').extract_first()
17 |             house_info_list = [e for e in re.split(r'\|', house_text) if len(e) > 1]
18 |             house_room = house_info_list[0].strip()
19 |             house_area = ''.join(re.findall(r'[\d.]', house_info_list[1]))
20 |             total_price = c.css('.totalPrice span::text').extract_first()
21 |             unit_price = c.css('.unitPrice span::text').extract_first()
22 |             unit_price = re.findall(r'\d+', unit_price)[0]
23 |
24 |             item['house'] = house
25 |             item['total_price'] = float(total_price)
26 |             item['unit_price'] = int(unit_price)
27 |             item['house_area'] = float(house_area)
28 |             item['house_room'] = house_room
29 |             yield item
30 |
31 |         page_info = response.css('div[class="page-box fr"]').css('div::attr(page-data)').extract_first()
32 |         if page_info:
33 |             page_list = re.findall(r'\d+', page_info)  # [totalPage, curPage]
34 |             cur_page, total_page = int(page_list[1]), int(page_list[0])
35 |             if cur_page < total_page:
36 |                 url = self.start_urls[0] + 'pg' + str(cur_page + 1)
37 |                 yield scrapy.Request(url=url, callback=self.parse)
38 |
--------------------------------------------------------------------------------
/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 |
 6 | [settings]
 7 | default = nj_house.settings
 8 |
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = nj_house
12 |
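The pager handled at the end of lj_house.py is driven by the page-data attribute, which Lianjia renders as a small JSON string (typically of the form {"totalPage":100,"curPage":1}). If that format holds, the regex in parse() could equally be replaced by json parsing; a hedged sketch of the tail of parse(), assuming that attribute layout and an `import json` at the top of the file:

        # Inside LjHouseSpider.parse(); assumes page-data is JSON shaped like
        # {"totalPage": 100, "curPage": 1}.
        page_info = response.css('div[class="page-box fr"]').css('div::attr(page-data)').extract_first()
        if page_info:
            page_data = json.loads(page_info)
            if page_data['curPage'] < page_data['totalPage']:
                next_url = self.start_urls[0] + 'pg' + str(page_data['curPage'] + 1)
                yield scrapy.Request(next_url, callback=self.parse)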
-------------------------------------------------------------------------------- /Python3网络爬虫快速入门篇/biqukan.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | from bs4 import BeautifulSoup 3 | import requests, sys 4 | 5 | """ 6 | 类说明:下载《笔趣看》网小说《一念永恒》 7 | Parameters: 8 | 无 9 | Returns: 10 | 无 11 | """ 12 | class downloader(object): 13 | 14 | def __init__(self): 15 | self.server = 'http://www.biqukan.com/' 16 | self.target = 'http://www.biqukan.com/1_1094/' 17 | self.names = [] #存放章节名 18 | self.urls = [] #存放章节链接 19 | self.nums = 0 #章节数 20 | 21 | """ 22 | 函数说明:获取下载链接 23 | Parameters: 24 | 无 25 | Returns: 26 | 无 27 | """ 28 | def get_download_url(self): 29 | req = requests.get(url = self.target) 30 | html = req.text 31 | div_bf = BeautifulSoup(html) 32 | div = div_bf.find_all('div', class_ = 'listmain') 33 | a_bf = BeautifulSoup(str(div[0])) 34 | a = a_bf.find_all('a') 35 | self.nums = len(a[15:]) #剔除不必要的章节,并统计章节数 36 | for each in a[15:]: 37 | self.names.append(each.string) 38 | self.urls.append(self.server + each.get('href')) 39 | 40 | """ 41 | 函数说明:获取章节内容 42 | Parameters: 43 | target - 下载连接(string) 44 | Returns: 45 | texts - 章节内容(string) 46 | """ 47 | def get_contents(self, target): 48 | req = requests.get(url = target) 49 | html = req.text 50 | bf = BeautifulSoup(html, "lxml") 51 | texts = bf.find_all('div', class_ = 'showtxt') 52 | texts = texts[0].text.replace('\xa0'*8,'\n\n') 53 | return texts 54 | 55 | """ 56 | 函数说明:将爬取的文章内容写入文件 57 | Parameters: 58 | name - 章节名称(string) 59 | path - 当前路径下,小说保存名称(string) 60 | text - 章节内容(string) 61 | Returns: 62 | 无 63 | """ 64 | def writer(self, name, path, text): 65 | write_flag = True 66 | with open(path, 'a', encoding='utf-8') as f: 67 | f.write(name + '\n') 68 | f.writelines(text) 69 | f.write('\n\n') 70 | 71 | if __name__ == "__main__": 72 | dl = downloader() 73 | dl.get_download_url() 74 | print('《一年永恒》开始下载:') 75 | for i in range(dl.nums): 76 | dl.writer(dl.names[i], '一念永恒.txt', dl.get_contents(dl.urls[i])) 77 | sys.stdout.write(" 已下载:%.3f%%" % float(i/dl.nums) + '\r') 78 | sys.stdout.flush() 79 | print('《一年永恒》下载完成') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Python3网络爬虫中小型项目实战清单 2 | 3 | 01.python爬取电影天堂 4 | 5 | 02.python爬取斗罗大陆小说 6 | 7 | 03.Python抓取欧洲足球联赛数据 8 | 9 | 04.python爬取豆瓣电影Top250 10 | 11 | 05.python爬取股票数据 12 | 13 | 06.python爬取人人贷网数据 14 | 15 | 07.python爬取创业邦创投库 16 | 17 | 08.python抓取美团网百万商家信息 *** 18 | 19 | 09.python爬取网易云音乐评论并把他们存入mysql数据库 *** 20 | 21 | 10.python爬取“网上购物”类APP 22 | 23 | 11.python爬取链家网房价信息 *** 24 | 25 | 12.python爬取并分析豆瓣中最新电影的影评(词云显示) 26 | 27 | 13.python爬取豆瓣书籍信息 28 | 29 | 14.python爬取今日头条信息并导入mongodb数据库 30 | 31 | 15.python爬取百度招聘内容并存入mongodb数据库 *** 32 | 33 | 16.python爬取熊猫直播用户信息 34 | 35 | 17.scrapy爬取游天下南京短租房信息并存入mongodb数据库 36 | 37 | 18.scrapy爬取中国医学人才网信息并以json格式保存 38 | 39 | 19.scrapy框架爬取豆瓣电影top250信息 40 | 41 | 20.scrapy爬取织梦者网站信息并存入mongodb数据库 *** 42 | 43 | 21.python爬取豆瓣电影<前任3>评论(词云显示) 44 | 45 | 22.python爬取Bilibili用户信息并导入mysql数据库 *** 46 | 47 | 23.python爬取网易云音乐所有歌曲的评论数 48 | 49 | 24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库 *** 50 | 51 | 25.scrapy爬取前程无忧网站python相关的工作信息 52 | 53 | 26.scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库 *** 54 | 55 | 27.scrapy爬取南京20000多套二手房信息 --------------------------------------------------------------------------------