├── Python3网络爬虫中小型项目实战集中营 ├── 01_python爬取电影天堂 │ ├── dytt.py │ └── 电影天堂.csv ├── 02_python爬取斗罗大陆小说 │ ├── dldl.py │ ├── 斗破苍穹小说.csv │ ├── 斗破苍穹小说.py │ └── 斗罗大陆小说.csv ├── 03_python爬取欧洲足球联赛数据 │ └── footballData.py ├── 04_python爬取豆瓣电影Top250 │ ├── douban_top250_movies.csv │ └── filmTop250.py ├── 05_python爬取股票数据 │ └── stockInfo.py ├── 06_python爬取人人贷网数据 │ └── peopleLoad.py ├── 07_python爬取创业邦创投库 │ ├── python爬取创业邦创投库.py │ └── resultsDatas.csv ├── 08_python抓取美团网百万商家信息 │ ├── meituan.csv │ └── python抓取美团网百万商家信息.py ├── 09_python爬取网易云音乐评论并把他们存入mysql数据库 │ └── python爬取网易云音乐评论并把他们存入mysql数据库.py ├── 10_python爬取“网上购物”类APP │ ├── apps.csv │ ├── python爬取网上购物类APP数据py │ └── 网上购物类APP数据分析并展示.py ├── 11_python爬取链家网房价信息 │ ├── Lianjia_Info_v1.py │ ├── Lianjia_Info_v2.py │ ├── Lianjia_Info_v3.py │ ├── Lianjia_Info_v4.py │ ├── Lianjia_Info_v4_analysis.py │ ├── lianjia.csv │ ├── lianjia_ershou_futian_100.xlsx │ └── lianjia_re_v4.csv ├── 12_python爬取并分析豆瓣中最新电影的影评(词云显示) │ ├── alice_mask.png │ ├── alice_mask1.png │ ├── python爬取并分析豆瓣中最新电影的影评.py │ ├── show_Chinese.png │ ├── stopwords.txt │ └── 豆瓣影评爬取入库.py ├── 13_python爬取豆瓣书籍信息 │ ├── books.csv │ └── python爬取豆瓣书籍信息.py ├── 14_python爬取今日头条信息并导入mongodb数据库 │ └── python爬取今日头条信息并导入mongodb数据库.py ├── 15_python使用selenium爬取百度招聘内容并存入mongodb数据库 │ └── python使用selenium爬取百度招聘内容并入mongodb数据库.py ├── 16_python爬取熊猫直播用户信息 │ └── python爬取熊猫直播用户信息.py ├── 17_scrapy爬取游天下南京短租房信息并存入mongodb数据库 │ └── youtxNanJin │ │ ├── README.txt │ │ ├── scrapy.cfg │ │ ├── youtxNanJin │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── youtxNanJin_spider.cpython-36.pyc │ │ │ └── youtxNanJin_spider.py │ │ ├── 游天下南京.csv │ │ └── 游天下南京.json ├── 18_scrapy爬取中国医学人才网信息并以json格式保存 │ └── chinadoctornet │ │ ├── README.txt │ │ ├── chinadoctornet │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── chinadoctornet_spider.cpython-36.pyc │ │ │ └── chinadoctornet_spider.py │ │ ├── scrapy.cfg │ │ ├── 中国医学人才网招聘最新招聘专栏.csv │ │ └── 中国医学人才网招聘最新招聘专栏.json ├── 19_scrapy框架爬取豆瓣电影top250信息 │ └── doubanmovie │ │ ├── README.txt │ │ ├── doubanmovie │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── doubanmovie_spider.cpython-36.pyc │ │ │ └── doubanmovie_spider.py │ │ ├── items.csv │ │ ├── items.json │ │ └── scrapy.cfg ├── 20_scrapy爬取织梦者网站信息并存入mongodb数据库 │ └── makedream │ │ ├── makedream │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── makedream_spider.cpython-36.pyc │ │ │ └── 
makedream_spider.py │ │ └── scrapy.cfg ├── 21_python爬取豆瓣电影前任3评论(词云显示) │ ├── ComentsAnaylst.py │ ├── ciyun.jpg │ ├── ciyun.png │ ├── douban.txt │ └── douban_qianren3.py ├── 22_python爬取Bilibili用户信息并导入mysql数据库 │ ├── bilibili_user.py │ ├── bilibili_user_info.sql │ └── user_agents.txt ├── 23_python爬取网易云音乐所有歌曲的评论数 │ ├── README.md │ ├── album_by_artist.py │ ├── artists.py │ ├── comments_by_music.py │ ├── music_by_album.py │ └── sql.py ├── 24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库 │ └── findtrip │ │ ├── ctrip_items.csv │ │ ├── findtrip │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── ctrip_spider.cpython-36.pyc │ │ │ ├── qua_spider.cpython-36.pyc │ │ │ └── washctrip.cpython-36.pyc │ │ │ ├── ctrip_spider.py │ │ │ ├── qua_spider.py │ │ │ └── washctrip.py │ │ ├── qua_items.csv │ │ ├── qua_items.json │ │ └── scrapy.cfg ├── 25_scrapy爬取前程无忧网站python相关的工作信息 │ └── pythonjobs │ │ ├── PythonJobs.csv │ │ ├── pythonjobs │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── job_spider.cpython-36.pyc │ │ │ └── job_spider.py │ │ └── scrapy.cfg ├── 26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库 │ └── shuimujob │ │ ├── ghostdriver.log │ │ ├── scrapy.cfg │ │ └── shuimujob │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ ├── platform.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── platform.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── shuimu_spider.cpython-36.pyc │ │ └── shuimu_spider.py └── 27_scrapy爬取南京20000多套二手房信息 │ └── nj_house │ ├── house.csv │ ├── nj_house │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── lj_house.cpython-36.pyc │ │ └── lj_house.py │ └── scrapy.cfg ├── Python3网络爬虫快速入门篇 ├── README.md ├── biqukan.py └── 一念永恒.txt └── README.md /Python3网络爬虫中小型项目实战集中营/01_python爬取电影天堂/dytt.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取电影天堂最新电影迅雷下载地址链接信息 3 | 所用模块:requests bs4 pandas数据分析 4 | ''' 5 | import requests 6 | import re 7 | import pandas as pd 8 | 9 | url = 'https://www.dy2018.com/html/gndy/dyzz/index.html' 10 | 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 13 | } 14 | 15 | items_list = [] 16 | 17 | html = requests.get(url,headers=headers) 18 | html.encoding = 'gb2312' 19 | data = re.findall('.*?',html_1.text) 27 | #print(data_1[0]) 28 | list_1 = [i[1], url_1, data_1[0]] 29 | 30 | # list_1 = [url_1] 31 | 32 | items_list.append(list_1) 33 | #print (list_1) 34 | 35 | #print 
('==========================================================================================================') 36 | 37 | for m in range(2, 298): 38 | url_2 = 'https://www.dy2018.com/html/gndy/dyzz/index_'+str(m)+'.html' 39 | print(url_2) 40 | html_2 = requests.get(url_2,headers=headers) 41 | html_2.encoding = 'gb2312' 42 | data_2 = re.findall('.*?',html_3.text) 50 | #print(data_3[0]) 51 | if len(data_3) < 1: 52 | continue 53 | list_2 = [n[1], url_3, data_3[0]] 54 | # list_2 = [url_3] 55 | 56 | 57 | items_list.append(list_2) 58 | #print (list_2) 59 | #print ('=====================================================================================================') 60 | 61 | df = pd.DataFrame(items_list, columns = ['电影名称','电影网址链接','电影迅雷下载链接']) 62 | 63 | df.to_csv('dytt.csv') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/01_python爬取电影天堂/电影天堂.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/01_python爬取电影天堂/电影天堂.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/dldl.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取斗罗大陆最新章节标题信息 3 | 所用模块:requests re bs4 pandas数据分析 4 | ''' 5 | import requests 6 | import re 7 | import pandas as pd 8 | from bs4 import BeautifulSoup #分析网页 获取标签内容 9 | 10 | url = 'https://www.freexs.org/novel/0/896/index.html' 11 | 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 14 | } 15 | 16 | items_list = [] 17 | 18 | html = requests.get(url,headers=headers) 19 | html.encoding = 'gb2312' 20 | 21 | data = re.findall('
(.*?)
',html.text) 22 | for i in data: 23 | url_1 = 'https://www.freexs.org/novel/0/896/'+str(i[0]) 24 | print (i[1]) 25 | print (url_1) 26 | list = [url_1, i[1]] 27 | items_list.append(list) 28 | 29 | 30 | # html_1 = requests.get(url_1,headers=headers) 31 | # html_1.encoding = 'gb2312' 32 | # soup = BeautifulSoup(html_1.text,'lxml') 33 | # title = soup.find('div', class_='readout').text #标题 34 | # print (title) 35 | 36 | df = pd.DataFrame(items_list, columns = ['链接','章节主题']) 37 | df.to_csv('斗罗大陆小说.csv') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/斗破苍穹小说.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/斗破苍穹小说.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/斗破苍穹小说.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | ''' 4 | 今日主题:python抓取斗破苍穹最新章节标题信息 5 | 所用模块:requests re bs4 pandas数据分析 6 | ''' 7 | import requests 8 | import re 9 | import pandas as pd 10 | from bs4 import BeautifulSoup #分析网页 获取标签内容 11 | 12 | url = 'https://www.miaobige.com/read/68/' 13 | 14 | headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'} 15 | 16 | items_list = [] 17 | 18 | html = requests.get(url, headers=headers) 19 | # html.encoding = 'gb2312' 20 | soup = BeautifulSoup(html.text,'html.parser') 21 | title = soup.find('div', id='readerlists') 22 | datas = re.findall('
  • (.*?)
  • ',title.text) 23 | for data in datas: 24 | url_1 = 'https://www.miaobige.com/' + data[0] 25 | print (data) 26 | item_list = [url_1, data[1]] 27 | items_list.append(item_list) 28 | 29 | df = pd.DataFrame(items_list, columns = ['链接','章节主题']) 30 | df.to_csv('斗破苍穹小说.csv') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/斗罗大陆小说.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/斗罗大陆小说.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/03_python爬取欧洲足球联赛数据/footballData.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | 4 | import requests 5 | import urlparse 6 | import bs4 7 | import csv 8 | 9 | BASE_URL = "http://soccerdata.sports.qq.com" 10 | PLAYER_LIST_QUERY = "/playerSearch.aspx?lega=%s&pn=%d" 11 | league = ['epl','seri','bund','liga','fran','scot','holl','belg'] 12 | page_number_limit = 100 13 | player_fields = ['league_cn','img','name_cn','name','team','age','position_cn','nation','birth','query','id','teamid','league'] 14 | 15 | def get_players(baseurl): 16 | html = requests.get(baseurl).text 17 | soup = bs4.BeautifulSoup(html, "lxml") 18 | players = [ dd for dd in soup.select('.searchResult tr') if dd.contents[1].name != 'th'] 19 | result = [] 20 | for player in players: 21 | record = [] 22 | link = '' 23 | query = [] 24 | for item in player.contents: 25 | if type(item) is bs4.element.Tag: 26 | if not item.string and item.img: 27 | record.append(item.img['src']) 28 | else : 29 | record.append(item.string and item.string.strip() or 'na') 30 | try: 31 | o = urlparse.urlparse(item.a['href']).query 32 | if len(link) == 0: 33 | link = o 34 | query = dict([(k,v[0]) for k,v in urlparse.parse_qs(o).items()]) 35 | except: 36 | pass 37 | 38 | if len(record) != 10: 39 | for i in range(0, 10 - len(record)): 40 | record.append('na') 41 | record.append(unicode(link,'utf-8')) 42 | record.append(unicode(query["id"],'utf-8')) 43 | record.append(unicode(query["teamid"],'utf-8')) 44 | record.append(unicode(query["lega"],'utf-8')) 45 | result.append(record) 46 | return result 47 | 48 | result = [] 49 | for url in [ BASE_URL + PLAYER_LIST_QUERY % (l,n) for l in league for n in range(page_number_limit) ]: 50 | result = result + get_players(url) 51 | 52 | 53 | for i in league: 54 | for j in range(0, 100): 55 | url = BASE_URL + PLAYER_LIST_QUERY % (l,n) 56 | ## send request to url and do scraping 57 | 58 | 59 | def write_csv(filename, content, header = None): 60 | file = open(filename, "wb") 61 | file.write('\xEF\xBB\xBF') 62 | writer = csv.writer(file, delimiter=',') 63 | if header: 64 | writer.writerow(header) 65 | for row in content: 66 | encoderow = [dd.encode('utf8') for dd in row] 67 | writer.writerow(encoderow) 68 | 69 | write_csv('players.csv',result,player_fields) 70 | 71 | def get_player_match(url): 72 | html = requests.get(url).text 73 | soup = bs4.BeautifulSoup(html, "lxml") 74 | matches = [ dd for dd in soup.select('.shtdm tr') if dd.contents[1].name != 'th'] 75 | records = [] 76 | for item in [ dd for dd in matches if len(dd.contents) > 11]: ## filter out the personal part 77 | record = [] 78 | for match in [ dd for dd in item.contents if type(dd) is bs4.element.Tag]: 79 | if match.string: 80 | 
record.append(match.string) 81 | else: 82 | for d in [ dd for dd in match.contents if type(dd) is bs4.element.Tag]: 83 | query = dict([(k,v[0]) for k,v in urlparse.parse_qs(d['href']).items()]) 84 | record.append('teamid' in query and query['teamid'] or query['id']) 85 | record.append(d.string and d.string or 'na') 86 | records.append(record) 87 | return records[1:] ##remove the first record as the header 88 | 89 | def get_players_match(playerlist, baseurl = BASE_URL + '/player.aspx?'): 90 | result = [] 91 | for item in playerlist: 92 | url = baseurl + item[10] 93 | print (url) 94 | result = result + get_player_match(url) 95 | return result 96 | match_fields = ['date_cn','homeid','homename_cn','matchid','score','awayid','awayname_cn','league_cn','firstteam','playtime','goal','assist','shoot','run','corner','offside','foul','violation','yellowcard','redcard','save'] 97 | write_csv('m.csv',get_players_match(result),match_fields) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/04_python爬取豆瓣电影Top250/douban_top250_movies.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/04_python爬取豆瓣电影Top250/douban_top250_movies.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/04_python爬取豆瓣电影Top250/filmTop250.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- encoding:utf-8 -*- 3 | 4 | """ 5 | @author : Tom 6 | @file : douban_movie 7 | @time : 2018/4/6 23:04 8 | @description : 9 | 10 | """ 11 | 12 | import requests 13 | import re 14 | from bs4 import BeautifulSoup 15 | import csv 16 | 17 | 18 | # 先创建一个csv文件,写好头部 19 | with open("douban_top250_movies.csv", 'w') as filed: # a+为添加,w为擦除重写 20 | csv_writer = csv.DictWriter(filed, [ 21 | u'片名', 22 | u'评分', 23 | u'评分人数', 24 | u'一句话描述', 25 | u'豆瓣链接', 26 | ]) 27 | csv_writer.writeheader() 28 | 29 | 30 | def get_mov_info(response): 31 | mov_info = {} 32 | soup = BeautifulSoup(response.text, "lxml") 33 | movies = soup.find_all('div', class_="info") 34 | 35 | for info in movies: 36 | # 获得电影的中文名 37 | mov_info['mov_name'] = info.find('span', class_='title').text # find()只找到一个,结果以树结构返回 38 | 39 | # 获得电影在豆瓣中的链接 40 | mov_info['mov_link'] = info.find('a').get('href') 41 | 42 | # 找到评分以及评价人数 43 | rating_num = info.find(class_='rating_num') 44 | mov_info['rating_score'] = rating_num.text 45 | comment = rating_num.find_next_sibling().find_next_sibling() 46 | # 对评价字段切分 47 | comment_num = re.findall('\d{0,}', comment.text) 48 | mov_info['comment_nums'] = comment_num[0] # 正则匹配re中没有find(),findall()以列表形式返回结果 49 | 50 | # 获得一句话评价 51 | comment_one = info.find('span', class_='inq') 52 | if comment_one is None: 53 | mov_info['inq_comment'] = u' ' 54 | else: 55 | mov_info['inq_comment'] = comment_one.text 56 | print (mov_info) 57 | 58 | # 一条条存入csv文件 59 | write_csv(mov_info) 60 | 61 | 62 | def write_csv(info_dict): 63 | with open("douban_top250_movies.csv", 'a+') as f: 64 | csv_write = csv.DictWriter(f, [ 65 | u'片名', 66 | u'评分', 67 | u'评分人数', 68 | u'一句话描述', 69 | u'豆瓣链接', 70 | ]) 71 | csv_write.writerow({ # writerow()写入单行,writerows写入多行,这里只有一行数据,用writerows报错 72 | u'片名': info_dict['mov_name'], 73 | u'评分': info_dict['rating_score'], 74 | u'评分人数': info_dict['comment_nums'], 75 | u'一句话描述': info_dict['inq_comment'], 76 | u'豆瓣链接': info_dict['mov_link'] 77 | 
}) 78 | 79 | for num in range(0, 10): 80 | page = num * 25 81 | response = requests.get("https://movie.douban.com/top250?start=%d&filter=" % page) 82 | get_mov_info(response) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/05_python爬取股票数据/stockInfo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import traceback 6 | import re 7 | 8 | def getHTMLText(url): 9 | try: 10 | r = requests.get(url) 11 | r.raise_for_status() 12 | r.encoding = r.apparent_encoding 13 | return r.text 14 | except: 15 | return "" 16 | 17 | def getStockList(lst, stockURL): 18 | html = getHTMLText(stockURL) 19 | soup = BeautifulSoup(html, 'html.parser') 20 | a = soup.find_all('a') 21 | for i in a: 22 | try: 23 | href = i.attrs['href'] 24 | lst.append(re.findall(r"[s][hz]\d{6}", href)[0]) 25 | except: 26 | continue 27 | 28 | def getStockInfo(lst, stockURL, fpath): 29 | count = 0 30 | for stock in lst: 31 | url = stockURL + stock + ".html" 32 | html = getHTMLText(url) 33 | try: 34 | if html=="": 35 | continue 36 | infoDict = {} 37 | soup = BeautifulSoup(html, 'html.parser') 38 | stockInfo = soup.find('div',attrs={'class':'stock-bets'}) 39 | 40 | name = stockInfo.find_all(attrs={'class':'bets-name'})[0] 41 | infoDict.update({'股票名称': name.text.split()[0]}) 42 | 43 | keyList = stockInfo.find_all('dt') 44 | valueList = stockInfo.find_all('dd') 45 | for i in range(len(keyList)): 46 | key = keyList[i].text 47 | val = valueList[i].text 48 | infoDict[key] = val 49 | 50 | with open(fpath, 'a', encoding='utf-8') as f: 51 | f.write( str(infoDict) + '\n' ) 52 | count = count + 1 53 | print("\r当前进度: {:.2f}%".format(count*100/len(lst)),end="") 54 | except: 55 | count = count + 1 56 | print("\r当前进度: {:.2f}%".format(count*100/len(lst)),end="") 57 | continue 58 | 59 | def main(): 60 | stock_list_url = 'http://quote.eastmoney.com/stocklist.html' 61 | stock_info_url = 'https://gupiao.baidu.com/stock/' 62 | output_file = 'BaiduStockInfo.csv' 63 | slist=[] 64 | getStockList(slist, stock_list_url) 65 | getStockInfo(slist, stock_info_url, output_file) 66 | 67 | main() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/06_python爬取人人贷网数据/peopleLoad.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import numpy as np 4 | import requests 5 | import time 6 | import random 7 | from bs4 import BeautifulSoup 8 | 9 | s=requests.session() 10 | 11 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'} 12 | #根据浏览器下自行修改 13 | 14 | headers['Cookie'] = 'gr_user_id=022d0f46-4981-4224-9895-18bfe32d9276; rrdLoginCartoon=rrdLoginCartoon; pgv_pvi=905847926; Hm_lvt_16f9bb97b83369e62ee1386631124bb1=1474288518,1474332677,1474336816,1474368269; Hm_lpvt_16f9bb97b83369e62ee1386631124bb1=1474372985; JSESSIONID=7EB90C9967D8C42B08DFB18EB9A9F74ED2ACC468B7D56B9372E2A20684713847; jforumUserInfo=bEAY23pgyLLLjII69w9oS%2BtK2jljmxa8%0A; IS_MOBLIE_IDPASS=true-false; activeTimestamp=5195275; gr_session_id_9199126ed94d770d=70bbe285-4ac6-42c9-a49b-9255d0eb9c46; gr_cs1_70bbe285-4ac6-42c9-a49b-9255d0eb9c46=user_id%3A5195275' 15 | #根据浏览器F12下的Request Headers->Cookie自行复制上去即可 16 | 17 | 18 | def parse_userinfo(loanid):#自定义解析借贷人信息的函数 19 | timestamp=str(int(time.time())) + '%03d' % random.randint(0,999)
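        # 说明:秒级时间戳(10位)后拼接3位随机数,得到形如前端 Date.now() 的13位时间戳字符串(推测用于模拟浏览器请求参数)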
20 | urll="http://www.we.com/lend/detailPage.action?loanId=%.0f&timestamp=" % loanid+timestamp
    #这个urll我也不知道怎么来的,貌似可以用urll="http://www.we.com/loan/%f" % loanid+timestamp
    #(就是页面本身,我也没试过) 21 | result = s.get(urll,headers=headers) 22 | html = BeautifulSoup(result.text,'lxml') 23 | info = html.find_all('table',class_="ui-table-basic-list") 24 | info1= info[0] 25 | info2 = info1.find_all('div',class_="basic-filed") 26 | userinfo = {} 27 | for item in info2: 28 | vartag = item.find('span') 29 | var = vartag.string 30 | if var == '信用评级': 31 | var = '信用评分' 32 | pf1 = repr(item.find('em')) 33 | value = re.findall(r'\d+',pf1) 34 | else: 35 | valuetag = item.find('em') 36 | value = valuetag.string 37 | userinfo[var]=value 38 | data = pd.DataFrame(userinfo) 39 | return data 40 | 41 | rrd=pd.read_csv('loanId.csv') #loanId是之前散标数据中的loanId,将其单独整理为一个csv文档 42 | loanId=rrd.ix[:,'loanId'] 43 | user_info = ['昵称', '信用评分', 44 | 45 | '年龄', '学历', '婚姻', 46 | 47 | '申请借款', '信用额度', '逾期金额', '成功借款', '借款总额', '逾期次数','还清笔数', '待还本息', '严重逾期', 48 | 49 | '收入', '房产', '房贷', '车产', '车贷', 50 | 51 | '公司行业', '公司规模', '岗位职位', '工作城市', '工作时间'] 52 | 53 | table = pd.DataFrame(np.array(user_info).reshape(1, 24), columns=user_info) 54 | 55 | i = 1 56 | 57 | for loanid in loanId: 58 | table = pd.concat([table, parse_userinfo(loanid)]) 59 | print(i) 60 | i += 1 #看一下循环多少次 61 | 62 | table.to_csv('userinfo.csv',header=False) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/07_python爬取创业邦创投库/python爬取创业邦创投库.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import time 5 | from bs4 import BeautifulSoup 6 | import pandas as pd 7 | # 导入pandas库 8 | 9 | # 设置列表页面URL的固定部分 10 | url = 'https://bj.lianjia.com/ershoufang/' 11 | BASE_URL_U1 = "http://www.cyzone.cn/event/list-764-0-" 12 | BASE_URL_U2 = "-0-0-0-0/" 13 | 14 | # 最好在http请求中设置一个头部信息,否则很容易被封ip 15 | headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 16 | 'Accept':'text/html;q=0.9,*/*;q=0.8', 17 | 'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 18 | 'Accept-Encoding':'gzip', 19 | 'Connection':'close', 20 | 'Referer':'http://www.baidu.com/link?url=_andhfsjjjKRgEWkj7i9cFmYYGsisrnm2A-TN3XZDQXxvGsM9k9ZZSnikW2Yds4s&amp;wd=&amp;eqid=c3435a7d00146bd600000003582bfd1f' 21 | } 22 | 23 | # 循环抓取列表页信息 24 | for i in range(1,31): # 分页 25 | if i == 1: 26 | i=str(1) 27 | var_url = (BASE_URL_U1 + i + BASE_URL_U2) 28 | r = requests.get(url=var_url, headers=headers) 29 | html = r.content 30 | #print(html) 31 | else: 32 | i=str(i) 33 | var_url=(BASE_URL_U1 + i + BASE_URL_U2) 34 | var_url=requests.get(url=var_url,headers=headers) 35 | html2=r.content 36 | html = html + html2 37 | # 每次间隔1秒 38 | time.sleep(1) 39 | 40 | # 解析抓取的页面内容 41 | res = BeautifulSoup(html, 'html.parser') 42 | # 获取感兴趣目标信息: 43 | # 提取公司名称 44 | # table>tbody>tr.table-plate3>td.tp2>span.tp2_tit>a 45 | companys = res.find_all('span', 'tp2_tit') 46 | cnames = [] 47 | print(len(companys)) 48 | for item in companys: 49 | cname = item.a.string 50 | cnames.append(cname) 51 | 52 | #print(cnames) 53 | # 获取感兴趣目标信息: 54 | # 提取公司详情url 55 | companys = res.find_all('span', 'tp2_tit') 56 | urls = [] 57 | for item in companys: 58 | url = item.a['href'] 59 | urls.append(url) 60 | # 获取感兴趣目标信息: 61 | # 提取当前融资轮次,行业,投资方和更新时间 62 | 63 | # res = BeautifulSoup(html, 'html5lib') 64 | # finances = res.select('div#main > div.list-table3 > table > tbody > tr') 65 | finances = res.find_all('tr', 'table-plate3') 66 | # 融资轮次,行业,投资方,更新时间 67 | financing_rounds, businesses, investors, update_times = [],[],[],[] 68 | 
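# 说明:下面的循环按倒数位置从每行的 td 中取值:items[-5] 融资轮次、items[-4] 行业、items[-3] 投资方、items[-2] 更新时间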
#print(len(finances)) 69 | for i in range(0, len(finances)): 70 | # 获取第一行数据(范围) 71 | items = finances[i].find_all('td') 72 | # print(items) 73 | # 获取融资轮次 74 | fround = items[-5].text.strip() 75 | #获取行业 76 | business = items[-4].text.strip() 77 | #获取投资方 78 | investor = items[-3].text.strip() 79 | #获取更新时间 80 | update_time = items[-2].text.strip() 81 | financing_rounds.append(fround) 82 | businesses.append(business) 83 | investors.append(investor) 84 | update_times.append(update_time) 85 | 86 | # 将获取的数据进行汇总: 87 | #print(len(cnames)) 88 | #print(len(urls)) 89 | #print(len(financing_rounds)) 90 | #print(len(financing_rounds)) 91 | #print(len(businesses)) 92 | #print(len(investors)) 93 | #print(len(update_times)) 94 | 95 | # 创建数据表 96 | resultsDatas = pd.DataFrame({'公司名称':cnames,'详情URL':urls,'融资轮次':financing_rounds,'行业':businesses,'投资方':investors,'更新时间':update_times}) 97 | # 查看数据表内容 98 | print(resultsDatas) 99 | 100 | 101 | resultsDatas.to_csv("resultsDatas.csv") -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/07_python爬取创业邦创投库/resultsDatas.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/07_python爬取创业邦创投库/resultsDatas.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/08_python抓取美团网百万商家信息/meituan.csv: -------------------------------------------------------------------------------- 1 | ,title,score,address,phone,Evaluation_number 2 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/08_python抓取美团网百万商家信息/python抓取美团网百万商家信息.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取美团网百万商家信息 3 | 所用模块:requests bs4 数据分析 4 | 流程分析:1、获取主页源码 5 | 2、获取二级菜单链接(美食、电影。。。) 6 | 3、商品店家信息 7 | ''' 8 | import requests 9 | from bs4 import BeautifulSoup #分析网页 获取标签内容 10 | import json 11 | import lxml 12 | import pandas as pd 13 | 14 | url = 'http://chs.meituan.com/' 15 | 16 | headers = { 17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 18 | } 19 | 20 | #获取分类(电影、美食) 21 | def get_start_links(url, headers=None): 22 | html = requests.get(url,headers=headers).text #发送请求获取主页文本 23 | soup = BeautifulSoup(html, 'lxml') #解析网页 24 | links = [link.find('div').find('div').find('dl').find('dt').find('a')['href'] for link in soup.find_all('div', class_='J-nav-item')] 25 | return links 26 | 27 | #获取分类链接中的店铺id 28 | def get_detail_id(url, headers=None): 29 | html = requests.get(url,headers=headers).text 30 | soup = BeautifulSoup(html,'lxml') 31 | content_id = json.loads(soup.find('div', class_='J-scrollloader cf J-hub')['data-async-params']) 32 | return json.loads(content_id.get('data')).get('poiidList') 33 | 34 | #获取店铺详情数据 35 | def get_item_info(url, headers=None): 36 | html = requests.get(url,headers=headers).text 37 | soup = BeautifulSoup(html,'lxml') 38 | title = soup.find('span', class_='title').text #标题 39 | score = soup.find('span', class_='biz-level').get_text() #评分 40 | address = soup.find('span', class_='geo').text #地址 41 | phone = soup.find_all('p', class_='under-title')[1].get_text() #电话 42 | Evaluation_number = soup.find('a', class_='num rate-count').text #评价 43 | print (u'店名: '+title) 44 | print (u'评论数量: '+Evaluation_number) 45 | print (u'地址: '+address) 46 | 
print (u'评分: '+score) 47 | print (u'电话: '+phone) 48 | print ('======================================================') 49 | return (title, score, address, phone, Evaluation_number) 50 | 51 | 52 | items_list = [] 53 | 54 | start_url_list = get_start_links(url) 55 | for j in start_url_list:#分类链接 56 | for i in range(1,11): #多页 57 | category_url = j+'/all/page()'.format(i) #完整的分类多页链接 58 | shop_id_list = get_detail_id(category_url,headers=headers) 59 | print (shop_id_list) 60 | for shop_id in shop_id_list: 61 | items = get_item_info(url+'shop/{}'.format(shop_id),headers) 62 | items_list.append(items) 63 | 64 | df = pd.DataFrame(items_list, columns = ['title','score','address','phone','Evaluation_number']) 65 | df.to_csv('meituan.csv') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/09_python爬取网易云音乐评论并把他们存入mysql数据库/python爬取网易云音乐评论并把他们存入mysql数据库.py: -------------------------------------------------------------------------------- 1 | # 爬取网易云音乐评论并把他们存入mysql数据库 2 | import requests,json,os 3 | import base64 4 | import codecs 5 | from Crypto.Cipher import AES 6 | import pymysql 7 | 8 | 9 | class Spider(): 10 | 11 | def __init__(self): 12 | 13 | self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0', 14 | 'Referer': 'http://music.163.com/'} 15 | self.url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_531051217?csrf_token=' 16 | 17 | 18 | def __get_jsons(self,url,page): 19 | # 获取两个参数 20 | music = WangYiYun() 21 | text = music.create_random_16() 22 | params = music.get_params(text,page) 23 | encSecKey = music.get_encSEcKey(text) 24 | fromdata = {'params' : params,'encSecKey' : encSecKey} 25 | jsons = requests.post(url, data=fromdata, headers=self.header) 26 | #print(jsons.raise_for_status()) 27 | # 打印返回来的内容,是个json格式的 28 | #print(jsons.content) 29 | return jsons.text 30 | 31 | def json2list(self,jsons): 32 | '''把json转成字典,并把他重要的信息获取出来存入列表''' 33 | # 可以用json.loads()把他转成字典 34 | #print(json.loads(jsons.text)) 35 | users = json.loads(jsons) 36 | comments = [] 37 | for user in users['comments']: 38 | # print(user['user']['nickname']+' : '+user['content']+' 点赞数:'+str(user['likedCount'])) 39 | name = user['user']['nickname'] 40 | content = user['content'] 41 | # 点赞数 42 | likedCount = user['likedCount'] 43 | user_dict = {'name': name, 'content': content, 'likedCount': likedCount} 44 | comments.append(user_dict) 45 | return comments 46 | 47 | def write2sql(self,comments): 48 | '''把评论写入数据库''' 49 | music = Operate_SQL() 50 | print('第%d页正在获取' % self.page) 51 | for comment in comments: 52 | #print(comment) 53 | music.add_data(comment) 54 | print(' 该页获取完成') 55 | 56 | 57 | 58 | def run(self): 59 | self.page = 1 60 | while True: 61 | jsons = self.__get_jsons(self.url,self.page) 62 | comments = self.json2list(jsons) 63 | print(comments[0]) 64 | # 当这一页的评论数少于20条时,证明已经获取完 65 | self.write2sql(comments) 66 | if len(comments) < 20: 67 | print('评论已经获取完') 68 | break 69 | self.page +=1 70 | 71 | # 找出post的两个参数params和encSecKey 72 | class WangYiYun(): 73 | 74 | def __init__(self): 75 | # 在网易云获取的三个参数 76 | 77 | self.second_param = '010001' 78 | self.third_param = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7' 79 | self.fourth_param = '0CoJUm6Qyw8W8jud' 80 | 81 
| def create_random_16(self): 82 | '''获取随机十六个字母拼接成的字符串''' 83 | return (''.join(map(lambda xx: (hex(ord(xx))[2:]), str(os.urandom(16)))))[0:16] 84 | 85 | def aesEncrypt(self, text, key): 86 | # 偏移量 87 | iv = '0102030405060708' 88 | # 文本 89 | pad = 16 - len(text) % 16 90 | text = text + pad * chr(pad) 91 | encryptor = AES.new(key, 2, iv) 92 | ciphertext = encryptor.encrypt(text) 93 | ciphertext = base64.b64encode(ciphertext) 94 | return ciphertext 95 | 96 | def get_params(self,text,page): 97 | '''获取网易云第一个参数''' 98 | # 第一个参数 99 | if page == 1: 100 | self.first_param = '{rid: "R_SO_4_400162138", offset: "0", total: "true", limit: "20", csrf_token: ""}' 101 | else: 102 | self.first_param = ('{rid: "R_SO_4_400162138", offset:%s, total: "false", limit: "20", csrf_token: ""}'%str((page-1)*20)) 103 | 104 | params = self.aesEncrypt(self.first_param, self.fourth_param).decode('utf-8') 105 | params = self.aesEncrypt(params, text) 106 | return params 107 | 108 | def rsaEncrypt(self, pubKey, text, modulus): 109 | '''进行rsa加密''' 110 | text = text[::-1] 111 | rs = int(codecs.encode(text.encode('utf-8'), 'hex_codec'), 16) ** int(pubKey, 16) % int(modulus, 16) 112 | return format(rs, 'x').zfill(256) 113 | 114 | def get_encSEcKey(self,text): 115 | '''获取第二个参数''' 116 | pubKey = self.second_param 117 | moudulus = self.third_param 118 | encSecKey = self.rsaEncrypt(pubKey, text, moudulus) 119 | return encSecKey 120 | 121 | # 操作 mysql 122 | class Operate_SQL(): 123 | # 连接数据库 124 | def __get_conn(self): 125 | try: 126 | # 我用的的本地数据库,所以host是127.0.0.1 127 | self.conn = pymysql.connect(host='127.0.0.1',user='root',passwd='19980129.jie',port=3306,db='music',charset='utf8mb4') 128 | except Exception as e: 129 | print(e, '数据库连接失败') 130 | 131 | def __close_conn(self): 132 | '''关闭数据库连接''' 133 | try: 134 | if self.conn: 135 | self.conn.close() 136 | except pymysql.Error as e: 137 | print(e, '关闭数据库失败') 138 | 139 | def add_data(self,comment): 140 | '''增加一条数据到数据库''' 141 | sql = 'INSERT INTO `comments`(`name`,`content`,`likedCount`) VALUE(%s,%s,%s)' 142 | try: 143 | self.__get_conn() 144 | cursor = self.conn.cursor() 145 | cursor.execute(sql, (comment['name'],comment['content'],comment['likedCount'])) 146 | self.conn.commit() 147 | return 1 148 | except AttributeError as e: 149 | print(e,'添加数据失败') 150 | # 添加失败就倒回数据 151 | self.conn.rollback() 152 | return 0 153 | except pymysql.DataError as e: 154 | print(e) 155 | self.conn.rollback() 156 | return 0 157 | finally: 158 | if cursor: 159 | cursor.close() 160 | self.__close_conn() 161 | 162 | 163 | 164 | def main(): 165 | spider = Spider() 166 | spider.run() 167 | 168 | if __name__ == '__main__': 169 | main() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/10_python爬取“网上购物”类APP/apps.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/10_python爬取“网上购物”类APP/apps.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/10_python爬取“网上购物”类APP/python爬取网上购物类APP数据py: -------------------------------------------------------------------------------- 1 | # =========== Python3.X Jupyter =========== 2 | # =========== 步骤一、抓取每一个子分类的URL =========== 3 | 4 | # 导入第三方包 5 | import requests 6 | from bs4 import BeautifulSoup 7 | import numpy as np 8 | import time 9 | import pandas as pd 10 | 11 | # 设置请求头 12 | headers = 
{'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'} 13 | 14 | # 豌豆荚应用首页 > 安卓软件分类 > 网上购物 > 商城下载的链接 15 | url = 'http://www.wandoujia.com/category/5017_591' 16 | # 发送请求 17 | res = requests.get(url, headers = headers).text 18 | # 解析HTML 19 | soup = BeautifulSoup(res, 'html.parser') 20 | 21 | # 商城类app的5个分类链接及名称 22 | category_urls = [i.findAll('a')[0]['href'] for i in soup.findAll('ul',{'class':'switch-tab cate-tab'})[0].findAll('li')[1:]] 23 | category_names = [i.text.strip() for i in soup.findAll('ul',{'class':'switch-tab cate-tab'})[0].findAll('li')[1:]] 24 | 25 | 26 | 27 | # =========== 步骤二、生成所有子分类及页码的URL =========== 28 | 29 | # 各类别app的前10页urls 30 | names = [] 31 | urls = [] 32 | 33 | for url,name in zip(category_urls,category_names): 34 | for i in range(1,11): 35 | names.append(name) 36 | urls.append(url+'/'+str(i)) 37 | 38 | 39 | 40 | 41 | # =========== 步骤三、抓取子分类页下各APP对应的URL =========== 42 | 43 | # 根据每一页的url抓出app对应的链接 44 | app_urls = [] 45 | 46 | for url in urls: 47 | res = requests.get(url, headers = headers).text 48 | soup = BeautifulSoup(res,'html.parser') 49 | 50 | # 返回每个页面中app的名称及对应的链接 51 | # 为防止报错,这里做了异常处理 52 | try: 53 | app_lists = soup.findAll('ul',{'id':'j-tag-list'})[0] 54 | app_urls.extend([i.findAll('a')[0]['href'] for i in app_lists.findAll('h2',{'class':'app-title-h2'})]) 55 | except: 56 | pass 57 | 58 | 59 | 60 | 61 | # =========== 步骤四、爬虫抓取各APP的详细信息 =========== 62 | 63 | # 构建空的列表,用于数据的存储 64 | appname = [] 65 | appcategory = [] 66 | install = [] 67 | love = [] 68 | comments = [] 69 | size = [] 70 | update = [] 71 | version = [] 72 | platform = [] 73 | company = [] 74 | 75 | for url in app_urls: 76 | res = requests.get(url, headers = headers).text 77 | soup = BeautifulSoup(res,'html.parser') 78 | 79 | try: 80 | # 抓取的信息 81 | appname.append(soup.find('p',{'class':'app-name'}).text.strip()) 82 | appcategory.append('-'.join(soup.find('dl',{'class':'infos-list'}).findAll('dd')[1].text.strip().split('\n'))) 83 | install.append(soup.find('span',{'class':'item install'}).find('i').text) 84 | love.append(soup.find('span',{'class':'item love'}).find('i').text) 85 | comments.append(soup.find('div',{'class':'comment-area'}).find('i').text) 86 | size.append(soup.find('dl',{'class':'infos-list'}).findAll('dd')[0].text.strip()) 87 | update.append(soup.find('dl',{'class':'infos-list'}).findAll('dd')[3].text.strip()) 88 | version.append(soup.find('dl',{'class':'infos-list'}).findAll('dd')[4].text.strip()) 89 | platform.append(soup.find('dl',{'class':'infos-list'}).findAll('dd')[5].text.strip().split('\n')[0]) 90 | company.append(soup.find('dl',{'class':'infos-list'}).findAll('dd')[6].text.strip()) 91 | except: 92 | pass 93 | 94 | 95 | 96 | 97 | # =========== 步骤五、数据存储 =========== 98 | 99 | # 将存储的列表值写入到字典,并进行数据框的转换 100 | apps = pd.DataFrame({'appname':appname,'appcategory':appcategory, 101 | 'install':install,'love':love,'comments':comments,'size':size, 102 | 'update':update,'version':version,'platform':platform,'company':company}) 103 | 104 | # 数据导出 105 | apps.to_csv('apps.csv', index = False) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/10_python爬取“网上购物”类APP/网上购物类APP数据分析并展示.py: -------------------------------------------------------------------------------- 1 | # =========== Python3.X Jupyter =========== 2 | 3 | # 导入第三方包 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | # 读取外部数据源 9 | app_info = pd.read_csv('apps.csv') 10 | 
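# 兼容性提示(推测):下文使用的 Series.reshape(-1,1) 在较新版本的 pandas 中已移除,如运行报错可改为 app_info.comments.values.reshape(-1,1)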
11 | # 数据集的观测数量及变量数 12 | app_info.shape 13 | 14 | # 窥探数据前5行信息 15 | app_info.head() 16 | 17 | # 查看数据集各变量的类型 18 | app_info.dtypes 19 | 20 | # 检查数据是否有重复(一般对于爬虫数据都需要检查) 21 | any(app_info.duplicated()) 22 | 23 | # 数值变量的描述性分析 24 | app_info.describe() 25 | 26 | 27 | 28 | # 剔除重复观测 29 | app_info.drop_duplicates(inplace=True) 30 | app_info.shape 31 | 32 | # 删除评论人数为-1的观测(因为只有2条记录) 33 | app_info = app_info.loc[app_info.comments != -1,] 34 | 35 | # 离散变量的统计描述 36 | app_info.describe(include = ['object']) 37 | 38 | 39 | 40 | # 自定义函数,处理安装人数的单位 41 | def func(x): 42 | if x.find('亿') != -1: 43 | y = float(x[:-1])*10000 44 | elif x.find('万') != -1: 45 | y = float(x[:-1]) 46 | else: 47 | y = float(x)/10000 48 | return(y) 49 | # 安装人数变量的类型转换 50 | app_info['install_new'] = app_info.install.apply(func) 51 | 52 | # 自定义匿名函数 53 | y = lambda x : float(x[:-2]) if x.find('MB') != -1 else float(x[:-2])/1024 54 | # 软件大小变量的类型转换 55 | app_info['size_new'] = app_info['size'].apply(y) 56 | 57 | # 自定义匿名函数,将“暂无”设置为缺失值 58 | y = lambda x : np.nan if x == '暂无' else float(x[:-1])/100 59 | app_info['love_new'] = app_info['love'].apply(y) 60 | 61 | # 用中位数对好评率进行填补 62 | app_info['love_new'] = app_info.love_new.fillna(app_info.love_new.median()) 63 | 64 | # 日期类型的转换 65 | app_info['update_new'] = pd.to_datetime(app_info['update'], format = '%Y年%m月%d日') 66 | 67 | 68 | 69 | # 数值变量的描述性统计 70 | app_info.describe() 71 | 72 | # 删除不必要的变量 73 | app_info.drop(['install','size','love','update'], axis = 1, inplace=True) 74 | app_info.head() 75 | 76 | 77 | 78 | # 各类应用安装量最多的前5个APP(产生绘图数据) 79 | ls = [] 80 | 81 | categories = ['商城','团购','优惠','快递','全球导购'] 82 | for cate in categories: 83 | sub = app_info.loc[app_info.appcategory.apply(lambda x : x.find(cate) != -1),['appname','install_new']] 84 | 85 | # 取前5的安装量 86 | sub = sub.sort_values(by = ['install_new'],ascending=False)[:5] 87 | sub['type'] = cate 88 | ls.append(sub) 89 | # 合并数据集 90 | app_install_cat = pd.concat(ls) 91 | 92 | 93 | # 设置绘图风格 94 | plt.style.use('ggplot') 95 | # 中文处理 96 | plt.rcParams['font.sans-serif'] = 'Microsoft YaHei' 97 | 98 | # 为了让多张子图在一张图中完成,设置子图的位置 99 | ax1 = plt.subplot2grid((3,2),(0,0)) 100 | ax2 = plt.subplot2grid((3,2),(0,1)) 101 | ax3 = plt.subplot2grid((3,2),(1,0)) 102 | ax4 = plt.subplot2grid((3,2),(1,1)) 103 | ax5 = plt.subplot2grid((3,2),(2,0), colspan=2) # colspan指定跨过的列数 104 | 105 | # 将图框存放起来,用于循环使用 106 | axes = [ax1,ax2,ax3,ax4,ax5] 107 | types = app_install_cat.type.unique() 108 | 109 | # 循环的方式完成5张图的绘制 110 | for i in range(5): 111 | # 准备绘图数据 112 | data = app_install_cat.loc[app_install_cat.type == types[i]] 113 | # 绘制条形图 114 | axes[i].bar(range(5), data.install_new, color = 'steelblue', alpha = 0.7) 115 | # 设置图框大小 116 | gcf = plt.gcf() 117 | gcf.set_size_inches(8, 6) 118 | # 添加标题 119 | axes[i].set_title(types[i]+'类APP下载量前5的应用', size = 9) 120 | # 设置刻度位置 121 | axes[i].set_xticks(np.arange(5) + 0.4) 122 | # 为刻度添加标签值 123 | axes[i].set_xticklabels(data.appname, fontdict={'fontsize':7}, color = 'red') 124 | # 删除各子图上、右和下的边界刻度标记 125 | axes[i].tick_params(top = 'off', bottom = 'off', right = 'off') 126 | 127 | # 调整子图之间的水平间距和高度间距 128 | plt.subplots_adjust(hspace=0.6, wspace=0.3) 129 | 130 | # 显示图形 131 | plt.show() 132 | 133 | 134 | 135 | # 各类应用好评率最低的前5个APP(产生绘图数据) 136 | ls = [] 137 | categories = ['商城','团购','优惠','快递','全球导购'] 138 | for cate in categories: 139 | sub = app_info.loc[app_info.appcategory.apply(lambda x : x.find(cate) != -1),['appname','love_new']] 140 | # 取前5的安装量 141 | sub = sub.sort_values(by = ['love_new'])[:5] 142 | sub['type'] = cate 143 | ls.append(sub) 144 | 
app_love_cat = pd.concat(ls) 145 | 146 | # 为了让多张子图在一张图中完成,设置子图的位置 147 | ax1 = plt.subplot2grid((3,2),(0,0)) 148 | ax2 = plt.subplot2grid((3,2),(0,1)) 149 | ax3 = plt.subplot2grid((3,2),(1,0)) 150 | ax4 = plt.subplot2grid((3,2),(1,1)) 151 | ax5 = plt.subplot2grid((3,2),(2,0), colspan=2) # colspan指定跨过的列数 152 | 153 | # 将图框存放起来,用于循环使用 154 | axes = [ax1,ax2,ax3,ax4,ax5] 155 | types = app_love_cat.type.unique() 156 | 157 | # 循环的方式完成5张图的绘制 158 | for i in range(5): 159 | # 准备绘图数据 160 | data = app_love_cat.loc[app_love_cat.type == types[i]] 161 | # 绘制条形图 162 | axes[i].bar(range(5), data.love_new, color = 'steelblue', alpha = 0.7) 163 | # 设置图框大小 164 | gcf = plt.gcf() 165 | gcf.set_size_inches(8, 6) 166 | # 添加标题 167 | axes[i].set_title(types[i]+'类APP好评率后5的应用', size = 9) 168 | # 设置x轴刻度位置 169 | axes[i].set_xticks(np.arange(5) + 0.4) 170 | # 为x轴刻度添加标签值 171 | axes[i].set_xticklabels(data.appname, fontdict={'fontsize':7}, color = 'red') 172 | # 设置y轴刻度位置 173 | axes[i].set_yticks(np.arange(0,0.6,0.15)) 174 | # 为y轴刻度添加标签值 175 | axes[i].set_yticklabels([str(i*100) + '%' for i in np.arange(0,0.6,0.15)]) 176 | # 删除各子图上、右和下的边界刻度标记 177 | axes[i].tick_params(top = 'off', bottom = 'off', right = 'off') 178 | 179 | 180 | # 调整子图之间的水平间距和高度间距 181 | plt.subplots_adjust(hspace=0.6, wspace=0.3) 182 | # 显示图形 183 | plt.show() 184 | 185 | 186 | 187 | # 导入第三方模块 188 | from sklearn.linear_model import LinearRegression 189 | 190 | # 评价人数与好评率是否存在关系呢? 191 | # 散点图 192 | plt.scatter(app_info.comments, # 评价人数 193 | app_info.love_new, # 好评率 194 | s = 30, # 设置点的大小 195 | c = 'black', # 设置点的颜色 196 | marker = 'o', # 设置点的形状 197 | alpha = 0.9, # 设置点的透明度 198 | linewidths = 0.3, # 设置散点边界的粗细 199 | label = '观测点' 200 | ) 201 | 202 | # 建模 203 | reg = LinearRegression().fit(app_info.comments.reshape(-1,1), app_info.love_new) 204 | # 回归预测值 205 | pred = reg.predict(app_info.comments.reshape(-1,1)) 206 | 207 | # 绘制回归线 208 | plt.plot(app_info.comments, pred, linewidth = 2, label = '回归线') 209 | plt.legend(loc = 'lower right') 210 | 211 | # 添加轴标签和标题 212 | plt.title('评论人数与好评率的关系') 213 | plt.xlabel('评论人数') 214 | plt.ylabel('好评率') 215 | 216 | # 去除图边框的顶部刻度和右边刻度 217 | plt.tick_params(top = 'off', right = 'off') 218 | # 显示图形 219 | plt.show() 220 | 221 | 222 | 223 | # 评论人数的描述统计 224 | app_info.comments.describe(percentiles=np.arange(0,1.2,0.2)) 225 | 226 | #       有8成的APP,其评论人数不超过53人,数据太过偏态了。这里先筛选出评论人数不超过55人的app,然后,对其研究“评论人数”与“好评率”的关系。 227 | 228 | # 散点图 229 | sub_data = app_info.loc[app_info.comments <= 55,] 230 | # sub_data = app_info.loc[app_info.comments > 55,] 231 | plt.scatter(sub_data.comments, # 评价人数 232 | sub_data.love_new, # 好评率 233 | s = 30, # 设置点的大小 234 | c = 'black', # 设置点的颜色 235 | marker = 'o', # 设置点的形状 236 | alpha = 0.9, # 设置点的透明度 237 | linewidths = 0.3, # 设置散点边界的粗细 238 | label = '观测点' 239 | ) 240 | 241 | # 建模 242 | reg = LinearRegression().fit(sub_data.comments.reshape(-1,1), sub_data.love_new) 243 | # 回归预测值 244 | pred = reg.predict(sub_data.comments.reshape(-1,1)) 245 | 246 | # 绘制回归线 247 | plt.plot(sub_data.comments, pred, linewidth = 2, label = '回归线') 248 | plt.legend(loc = 'lower right') 249 | 250 | # 添加轴标签和标题 251 | plt.title('评论人数与好评率的关系') 252 | plt.xlabel('评论人数') 253 | plt.ylabel('好评率') 254 | 255 | # 显示图形 256 | plt.show() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/Lianjia_Info_v1.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from bs4 import BeautifulSoup 3 | import re 4 | import requests 5 | from parsel 
import Selector 6 | import pandas as pd 7 | import time 8 | ############################################################# 9 | ''''' 10 | 这个模块爬取链家网福田区的二手房信息;仅仅爬取了前100页的数据 11 | 为了避免反爬虫策略,设定每5秒钟抓取一页信息 12 | @time=2018-04-24 13 | @author=Tom 14 | 15 | ''' 16 | 17 | ########################################################### 18 | # 进行网络请求的浏览器头部 19 | headers={ 20 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36' 21 | 22 | } 23 | # pages是不同页码的网址列表 24 | pages=['https://sz.lianjia.com/ershoufang/futianqu/pg{}/'.format(x) for x in range(1,100)] 25 | ############################################################ 26 | 27 | ############################################################# 28 | lj_futian = pd.DataFrame(columns=['code','dec','img']) 29 | count=0 30 | def l_par_html(url): 31 | # 这个函数是用来获取链家网福田区二手房的信息 32 | wr=requests.get(url,headers=headers,stream=True) 33 | sel=Selector(wr.text) 34 | # describ用来获取房源的文字信息 35 | describ=sel.xpath('//li[@class="clear"]//text()').extract() 36 | new_information=([x for x in describ if x != '关注'and x != '加入对比' ]) 37 | sep_infor=' '.join(new_information).split(r'/平米')[:-1] 38 | # hou_code用来获取房源的编号 39 | hou_code=sel.xpath('//li[@class="clear"]/a/@data-housecode').extract() 40 | # hou_image用来获取房源的图片 41 | hou_image=sel.xpath('//li[@class="clear"]/a/img/@data-original').extract() 42 | # 将信息形成表格全部写到一起 43 | pages_info=pd.DataFrame(list(zip(hou_code,sep_infor,hou_image)),columns=['code','dec','img']) 44 | return pages_info 45 | 46 | for page in pages: 47 | a=l_par_html(page) 48 | count=count+1 49 | print ('the '+str(count)+' page is sucessful') 50 | time.sleep(5) 51 | lj_futian=pd.concat([lj_futian,a],ignore_index=True) 52 | 53 | # 将表格数据输出到excel文件 54 | lj_futian.to_excel('d:\\lianjia_ershou_futian_100.xlsx') 55 | 56 | 57 | #encoding:utf-8 58 | # import json # 使用json解码 因为拉勾网的格式是json 59 | # import requests # 使用这个requests是得到网页源码 60 | # import pandas as pd # 使用这个数据进行存储 61 | 62 | # items = [] # 定义空列表用来存放你得到的数据 63 | # # 循环两页 这里爬取的是两页内容 64 | # for i in range(1,2): 65 | # # 传入data 因为这个url是post的请求方法 pn指的是页数 kd指的是你搜索的内容 66 | # data = {'first': 'true', 'pn': i, 'kd': 'python'} 67 | # # 拉钩网的链接是固定的就变化的是页数 因为是post的提交方法 所以传入data 68 | # yuan = requests.post('https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false', data=data).text 69 | # # 使用json进行解码 因为返回的是一个json的格式 70 | # yuan = json.loads(yuan) 71 | # # 得到14个数据 72 | # for i in range(14): 73 | # item = [] 74 | # # 看下面的图片item里面的是什么数据 75 | # item.append(yuan['content']['positionResult']['result'][i]['positionName']) 76 | # item.append(yuan['content']['positionResult']['result'][i]['companyFullName']) 77 | # item.append(yuan['content']['positionResult']['result'][i]['salary']) 78 | # item.append(yuan['content']['positionResult']['result'][i]['city']) 79 | # item.append(yuan['content']['positionResult']['result'][i]['positionAdvantage']) 80 | # items.append(item) 81 | # # 使用的是pands的存数据 存为xlsx就是excel格式 82 | # data = pd.DataFrame(items) 83 | # data.to_excel('拉钩.xlsx') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/Lianjia_Info_v2.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #爬取链家二手房信息 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import csv 6 | 7 | def getHTMLText(url): 8 | try: 9 | r = requests.get(url,timeout=30) 10 | r.raise_for_status() 11 | r.encoding = r.apparent_encoding 12 | return 
r.text 13 | except: 14 | return '产生异常' 15 | 16 | def get_data(list,html): 17 | soup = BeautifulSoup(html,'html.parser') 18 | infos = soup.find('ul',{'class':'sellListContent'}).find_all('li') 19 | with open(r'lianjia.csv','a',encoding='utf-8') as f: 20 | for info in infos: 21 | name = info.find('div',{'class':'title'}).find('a').get_text() 22 | price =info.find('div',{'class':'priceInfo'}).find('div',{'class','totalPrice'}).find('span').get_text() 23 | f.write("{},{}\n".format(name,price)) 24 | 25 | def main(): 26 | start_url = 'https://sh.lianjia.com/ershoufang/pg' 27 | depth = 20 28 | info_list =[] 29 | for i in range(depth): 30 | url = start_url + str(i) 31 | html = getHTMLText(url) 32 | get_data(info_list,html) 33 | main() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/Lianjia_Info_v3.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import requests 3 | from requests.exceptions import RequestException 4 | from bs4 import BeautifulSoup 5 | from time import sleep 6 | import csv 7 | 8 | 9 | def write_to_file(content): 10 | with open('lianjia_bs4.csv', 'w') as csvfile: 11 | writer = csv.writer(csvfile) 12 | writer.writerows(content) 13 | csvfile.close() 14 | 15 | 16 | def get_one_page(url): 17 | try: 18 | response = requests.get(url) 19 | if response.status_code == 200: 20 | return response.text 21 | else: 22 | return None 23 | except RequestException: 24 | return None 25 | 26 | 27 | def parse_one_page(html): 28 | soup = BeautifulSoup(html, 'lxml') 29 | prefix = 'http://sh.lianjia.com' 30 | for item in soup.select('.info-panel'): 31 | houseUrl = prefix + item.find("h2").a["href"] 32 | title = item.find("h2").a["title"] 33 | spans = item.find(class_="where").find_all("span") 34 | xiaoqu, huxing, mianji = spans[0].string, spans[1].string.split('\xa0')[0], spans[2].string.split('\xa0')[0] 35 | cons = item.find(class_="con").find_all("a") 36 | area, sub_area = cons[0].string, cons[1].string 37 | subway = item.find(class_="fang-subway-ex").string 38 | price = item.find(class_="price").find(class_="num").string 39 | data = item.find(class_="price-pre").string.split('\n')[0] 40 | watched = item.find(class_="square").find(class_="num").string 41 | 42 | yield [houseUrl, title, xiaoqu, huxing, mianji, area, sub_area, subway, price, data, watched] 43 | 44 | 45 | if __name__ == '__main__': 46 | results = [] 47 | for page in range(100): 48 | sleep(1) 49 | print (page) 50 | url = 'http://sh.lianjia.com/zufang/d' + str(page) 51 | html = get_one_page(url) 52 | for item in parse_one_page(html): 53 | results.append(item) 54 | write_to_file(results) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/Lianjia_Info_v4.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import requests 3 | from requests.exceptions import RequestException 4 | from time import sleep 5 | import re 6 | import csv 7 | 8 | 9 | def write_to_file(content): 10 | with open('lianjia_re_v4.csv', 'w') as csvfile: 11 | writer = csv.writer(csvfile) 12 | writer.writerows(content) 13 | csvfile.close() 14 | 15 | 16 | def get_one_page(url): 17 | try: 18 | response = requests.get(url) 19 | if response.status_code == 200: 20 | return response.text 21 | else: 22 | return None 23 | except RequestException: 24 | return None 25 | 26 | 27 | def parse_one_page(html): 28 | pattern = 
re.compile('
    .*?"\shref="(.*?)".*?title="(.*?)".*?.*?(.*?)&nb.*?' + 29 | '(.*?)&nb.*?/">(.*?).*?/">(.*?).*?(.*?)<.*?-ex">(.*?)<' + 30 | '.*?-ex">(.*?).*?num">(\d+)<.*?-pre">(.*?)<.*?num">(\d+)<.*?', re.S) 31 | prefix = 'http://sh.lianjia.com' 32 | items = re.findall(pattern, html) 33 | for item in items: 34 | item = list(item) 35 | item[0] = prefix + item[0] 36 | item[6] = item[6].strip() 37 | item[10] = item[10].split('\n')[0] 38 | yield item 39 | 40 | 41 | def main(page, results): 42 | url = 'http://sh.lianjia.com/zufang/d' + str(page) 43 | html = get_one_page(url) 44 | for item in parse_one_page(html): 45 | results.append(item) 46 | 47 | 48 | if __name__ == '__main__': 49 | results = [] 50 | for i in range(100): 51 | sleep(1) 52 | print(i) 53 | main(i+1, results) 54 | write_to_file(results) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/Lianjia_Info_v4_analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import numpy as np 3 | import csv 4 | import re 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['font.sans-serif'] = ['SimHei'] 7 | 8 | 9 | if __name__ == "__main__": 10 | csv_reader = csv.reader(open('lianjia_re_v4.csv')) 11 | content = [] 12 | for row in csv_reader: 13 | content.append(row) 14 | 15 | all_region = [] 16 | regions = ['徐汇', '静安', '浦东', '杨浦', '闵行', '长宁', '宝山', '青浦', 17 | '金山', '普陀','松江', '嘉定', '闸北', '虹口', '奉贤', 18 | '崇明', '黄浦', '上海周边'] 19 | tmp = [] 20 | region_statistics = [] 21 | region_statistics_dict = {} 22 | for item in content: 23 | all_region.append(item[5]) 24 | 25 | for region in regions: 26 | if all_region.count(region): 27 | region_statistics.append(all_region.count(region)) 28 | region_statistics_dict[region] = all_region.count(region) 29 | tmp.append(region) 30 | regions = tmp 31 | 32 | fangzu = {} 33 | for region in regions: 34 | fangzu[region] = 0 35 | for item in content: 36 | fangzu[item[5]] += int(item[-3]) 37 | fangzu_average = [] 38 | for region in regions: 39 | fangzu_average.append(fangzu[region]/region_statistics_dict[region]) 40 | 41 | area = {} 42 | for region in regions: 43 | area[region] = 0 44 | for item in content: 45 | tmp = item[4] 46 | tmp = re.sub(r'[^\x00-\x7f]', '', tmp) 47 | area[item[5]] += int(tmp) 48 | area_average = [] 49 | for region in regions: 50 | area_average.append(area[region] / region_statistics_dict[region]) 51 | for i in range(len(area_average)): 52 | area_average[i] = fangzu_average[i]/area_average[i] 53 | 54 | # 地区分布 55 | a = [i for i in range(1, len(regions) + 1)] 56 | plt.bar(a, region_statistics, 0.4, color="blue") 57 | xlocations = np.array(range(1, len(regions) + 1)) 58 | plt.xticks(xlocations, regions, rotation=60) 59 | plt.ylabel("房屋数量") 60 | plt.xlabel("地区") 61 | plt.title("上海各区租房数量") 62 | for a, b in zip(a, region_statistics): 63 | plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=7) 64 | plt.savefig("上海各区租房数量.jpg", dpi=300) 65 | plt.close() 66 | 67 | # 各区房租情况 68 | a = [i for i in range(1, len(regions) + 1)] 69 | plt.bar(a, fangzu_average, 0.4, color="blue") 70 | xlocations = np.array(range(1, len(regions) + 1)) 71 | plt.xticks(xlocations, regions, rotation=60) 72 | plt.ylabel("月租 元/月") 73 | plt.xlabel("地区") 74 | plt.title("上海各区租房平均月租") 75 | for a, b in zip(a, fangzu_average): 76 | plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=7) 77 | plt.savefig("上海各区租房房租信息.jpg", dpi=300) 78 | plt.close() 79 | 80 | # 各区每平米平均月租 81 | a = [i for i in range(1, 
len(regions) + 1)] 82 | plt.bar(a, area_average, 0.4, color="blue") 83 | xlocations = np.array(range(1, len(regions) + 1)) 84 | plt.xticks(xlocations, regions, rotation=60) 85 | plt.ylabel("月租 元/月/平米") 86 | plt.xlabel("地区") 87 | plt.title("上海各区租房每平米平均月租") 88 | for a, b in zip(a, area_average): 89 | plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=7) 90 | plt.savefig("上海各区租房每平米房租信息.jpg", dpi=300) 91 | plt.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/lianjia_ershou_futian_100.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/lianjia_ershou_futian_100.xlsx -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/lianjia_re_v4.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/11_python爬取链家网房价信息/lianjia_re_v4.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/alice_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/alice_mask.png -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/alice_mask1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/alice_mask1.png -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/python爬取并分析豆瓣中最新电影的影评.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import warnings 4 | warnings.filterwarnings("ignore") 5 | import jieba # 分词包 6 | import numpy # numpy计算包 7 | import codecs # codecs提供的open方法来指定打开的文件的语言编码,它会在读取的时候自动转换为内部unicode 8 | import re 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | from PIL import Image 12 | from urllib import request 13 | from bs4 import BeautifulSoup as bs 14 | from wordcloud import WordCloud,ImageColorGenerator # 词云包 15 | import matplotlib 16 | matplotlib.rcParams['figure.figsize'] = (10.0, 5.0) 17 | 18 | 19 | 20 | # 分析网页函数 21 | def getNowPlayingMovie_list(): 22 | resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/') 23 | html_data = resp.read().decode('utf-8') 24 | soup = bs(html_data, 'html.parser') 25 | nowplaying_movie = soup.find_all('div', id='nowplaying') 26 | nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item') 27 | nowplaying_list = [] 28 | for item in nowplaying_movie_list: 29 | nowplaying_dict = {} 30 | nowplaying_dict['id'] = item['data-subject'] 31 | for tag_img_item in item.find_all('img'): 32 | nowplaying_dict['name'] = tag_img_item['alt'] 33 | nowplaying_list.append(nowplaying_dict) 34 | return nowplaying_list 35 | 36 | # 
爬取评论函数 37 | def getCommentsById(movieId, pageNum): 38 | eachCommentList = [] 39 | if pageNum > 0: 40 | start = (pageNum - 1) * 20 41 | else: 42 | return False 43 | requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20' 44 | print(requrl) 45 | resp = request.urlopen(requrl) 46 | html_data = resp.read().decode('utf-8') 47 | soup = bs(html_data, 'html.parser') 48 | comment_div_lits = soup.find_all('div', class_='comment') 49 | for item in comment_div_lits: 50 | if item.find_all('p')[0].string is not None: 51 | eachCommentList.append(item.find_all('p')[0].string) 52 | return eachCommentList 53 | 54 | def main(): 55 | # 循环获取第一个电影的前10页评论 56 | commentList = [] 57 | NowPlayingMovie_list = getNowPlayingMovie_list() 58 | for i in range(10): 59 | num = i + 1 60 | commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num) 61 | commentList.append(commentList_temp) 62 | 63 | # 将列表中的数据转换为字符串 64 | comments = '' 65 | for k in range(len(commentList)): 66 | comments = comments + (str(commentList[k])).strip() 67 | 68 | # 使用正则表达式去除标点符号 69 | pattern = re.compile(r'[\u4e00-\u9fa5]+') 70 | filterdata = re.findall(pattern, comments) 71 | cleaned_comments = ''.join(filterdata) 72 | 73 | # 使用结巴分词进行中文分词 74 | segment = jieba.lcut(cleaned_comments) 75 | words_df = pd.DataFrame({'segment': segment}) 76 | 77 | # 去掉停用词 78 | stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], 79 | encoding='utf-8') # quoting=3全不引用 80 | words_df = words_df[~words_df.segment.isin(stopwords.stopword)] 81 | 82 | # 统计词频 83 | words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size}) 84 | words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False) 85 | # print(words_stat.head()) 86 | 87 | bg_pic = numpy.array(Image.open("alice_mask.png")) 88 | 89 | # 用词云进行显示 90 | wordcloud = WordCloud( 91 | font_path="simhei.ttf", 92 | background_color="white", 93 | max_font_size=80, 94 | width = 2000, 95 | height = 1800, 96 | mask = bg_pic, 97 | mode = "RGBA" 98 | ) 99 | word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values} 100 | # print(word_frequence) 101 | """ 102 | word_frequence_list = [] 103 | for key in word_frequence: 104 | temp = (key, word_frequence[key]) 105 | word_frequence_list.append(temp) 106 | #print(word_frequence_list) 107 | """ 108 | wordcloud = wordcloud.fit_words(word_frequence) 109 | 110 | image_colors = ImageColorGenerator(bg_pic) # 根据图片生成词云颜色 111 | 112 | plt.imshow(wordcloud) #显示词云图片 113 | plt.axis("off") 114 | plt.show() 115 | wordcloud.to_file('show_Chinese.png') # 把词云保存下来 116 | 117 | main() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/show_Chinese.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/12_python爬取并分析豆瓣中最新电影的影评(词云显示)/show_Chinese.png -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/13_python爬取豆瓣书籍信息/books.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/13_python爬取豆瓣书籍信息/books.csv -------------------------------------------------------------------------------- 
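Note on the word-cloud script above (12_python爬取并分析豆瓣中最新电影的影评.py): its word-frequency step uses the dict-renaming form of GroupBy.agg ({"计数": numpy.size}), which pandas 1.0 and later no longer accept. A minimal equivalent sketch, assuming a recent pandas and reusing the variable names from that script (an assumed drop-in, not the author's original code):

words_stat = (
    words_df['segment']
    .value_counts()                      # count each word; already sorted descending
    .rename_axis('segment')
    .reset_index(name='计数')            # back to a DataFrame with columns ['segment', '计数']
)
word_frequence = dict(zip(words_stat['segment'].head(1000),
                          words_stat['计数'].head(1000)))

The resulting word_frequence dict can then be passed to wordcloud.fit_words() exactly as in the original script.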
/Python3网络爬虫中小型项目实战集中营/13_python爬取豆瓣书籍信息/python爬取豆瓣书籍信息.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | #爬虫爬取豆瓣书目录 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import json 7 | import pandas #该库用于对爬取的信息进行表格性操作 8 | from skimage import io#该库用于打印爬取到的照片 9 | 10 | url = 'https://market.douban.com/book/?utm_campaign=book_nav_freyr&utm_source=douban&utm_medium=pc_web&page={}&page_num=18' 11 | 12 | #该函数式用来返回一个列表存放含有书籍信息的字典 13 | def bookList(url): 14 | newurl = requests.get(url) 15 | soup = BeautifulSoup(newurl.text,'html.parser') 16 | result_total = [] 17 | for book in soup.select('.book-item'): 18 | if len(book.select('.book-brief'))>0: 19 | 20 | bookimag2 = io.imread(book.select('img')[1]['src'])#书的图片 21 | io.imshow(bookimag2) 22 | #io.show()#为了使爬取到的图片显示出来 23 | 24 | bookurl = book.select('a')[0]['href']#抓取书的链接url 25 | #print('链接: ',bookurl) 26 | 27 | result_total.append(booktextscore(bookurl))#将所抓取书的信息字典添加到列表里面 28 | 29 | bookimag1 = io.imread(book.select('img')[0]['src'])#背景图片 30 | io.imshow(bookimag1) 31 | #io.show() 32 | return result_total #返回一个列 33 | 34 | 35 | #该函数式用来爬取书籍的名字,评分,评价人数以及书的简单介绍 36 | def booktextscore(url): 37 | booktexturl = requests.get(url) 38 | soup = BeautifulSoup(booktexturl.text,'html.parser') 39 | result = {}#创建一个字典将相关书籍信息存入到字典中 40 | bookname = soup.select('.book-breintro h3')[0].text 41 | bookname2 = '《' + bookname + '》' 42 | print(bookname2) 43 | result['书籍名称'] = bookname2 44 | 45 | bookauthor = soup.select('.book-public')[0].text.lstrip('\n ').rstrip('\n ') 46 | result['作者'] = bookauthor 47 | print(bookauthor) 48 | 49 | print(url) 50 | result['书籍链接'] = url 51 | 52 | score = soup.select('.total-score')[0].text#爬取该书评分,其中可能含有有些书籍由于评论人数不足导致没有评分,加一个判断默认该种情况成评分为0 53 | if score == '评价人数不足': 54 | score = 0 55 | score = float(score) 56 | result['书籍评分'] = float(score)#将评分强制转换成float类型的 57 | print('评分:',score) 58 | 59 | commentnum = soup.select('.comment-number')[0].text#爬取本书评论人数 60 | print(commentnum) 61 | print('该书简介:\n') 62 | result['书籍评论人数'] = commentnum 63 | 64 | article = []#添加一个列表 65 | for ench in soup.select('.layout-content'):#爬取的是图书详情 66 | for p in ench.select('.paragraph-content p')[:-1]: 67 | article.append(p.text.strip())#将p标签中的文字添加到列表中 68 | articlebook = '\n '.join(article) 69 | #print(articlebook) 70 | #result['书籍简介'] = articlebook 71 | return result 72 | 73 | 74 | 75 | book_total = [] 76 | #由于书籍信息有两页,所以加一个循环将两页书籍信息都添加进列表中方便生成表格 77 | for ench in range(1,4): 78 | newurl = url.format(ench)#通过format将URL地址实现可变性,可以将两页书籍信息都打印出来 79 | book_result = bookList(newurl) 80 | book_total.extend(book_result) 81 | 82 | df = pandas.DataFrame(book_total) 83 | df.to_csv('books.csv')#将爬取后的书籍信息通过pandas转换成表格形式 84 | 85 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/14_python爬取今日头条信息并导入mongodb数据库/python爬取今日头条信息并导入mongodb数据库.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import re 5 | # 导入mongo 数据库 6 | import pymongo 7 | import json 8 | 9 | # 打开数据库连接,mongodb默认端口为27017 10 | conn = pymongo.MongoClient(host='localhost',port=27017) 11 | # 选择或创建数据库 12 | toutiao = conn['toutiao'] 13 | # 选择或者创建数据集合 14 | newsdata = toutiao['news'] 15 | 16 | toutiaoUrl = 'http://www.toutiao.com/api/pc/focus/' 17 | reqData = requests.get(toutiaoUrl).text 18 | print(reqData) 19 | 20 | jsonData = json.loads(reqData) 21 | newsData = jsonData['data']['pc_feed_focus'] 22 | 23 | # 存储到数据库 24 
| for new in newsData: 25 | title = new['title'] 26 | img_url = new['image_url'] 27 | url = new['media_url'] 28 | data = { 29 | 'title':title, 30 | 'img_url':img_url, 31 | 'url':url 32 | } 33 | # 插入一行数据 34 | newsdata.insert_one(data) 35 | 36 | for i in newsdata.find(): 37 | # 从数据库中读取出来 38 | print('i'+str(i)) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/15_python使用selenium爬取百度招聘内容并存入mongodb数据库/python使用selenium爬取百度招聘内容并入mongodb数据库.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import time 5 | 6 | from selenium import webdriver 7 | from selenium.webdriver.common.keys import Keys 8 | from selenium.webdriver.common.action_chains import ActionChains 9 | 10 | from bs4 import BeautifulSoup 11 | 12 | import pymongo 13 | 14 | # 多进程 15 | from multiprocessing import Pool 16 | 17 | # 1 打开数据库连接,mongodb默认端口为27017 18 | conn = pymongo.MongoClient(host='localhost',port=27017) 19 | # 2 选择或创建数据库 20 | jobdata = conn['baidujobs'] 21 | # 3 选择或创建数据集合 22 | ver_job = jobdata['verjob'] 23 | 24 | baidu_baseurl = 'http://zhaopin.baidu.com/quanzhi?tid=4139&ie=utf8&oe=utf8&query=python%E6%9D%AD%E5%B7%9E&city_sug=%E6%9D%AD%E5%B7%9E' 25 | def set_winscroll(driver): 26 | time.sleep(2) 27 | driver.execute_script('window.scrollBy(0,2000)') 28 | time.sleep(3) 29 | driver.execute_script('window.scrollBy(0,3000)') 30 | time.sleep(3) 31 | 32 | 33 | # 1 初始化driver 34 | driver = webdriver.PhantomJS() 35 | # 2 调用get方法 36 | driver.get(baidu_baseurl) 37 | # 3 进入网页 38 | set_winscroll(driver) 39 | 40 | # 4 获取资源(第一页的数据) 41 | we_data = driver.page_source 42 | # print('first_we_data ' + we_data) 43 | 44 | 45 | def parse_html(html): 46 | soup = BeautifulSoup(html, 'lxml') 47 | item_url = soup.findAll('a', {'class': 'clearfix item line-bottom'}) 48 | # for item in zip(item_url): 49 | # print(item.get('href')) 50 | 51 | # 职位信息 52 | jobs = soup.findAll('div', {'class': 'title-h3 line-clamp1'}) 53 | # for job in jobs: 54 | # print(job.string) # 职位信息 55 | # 地址 + 公司名 56 | compy = soup.findAll('p', {'class': 'area line-clamp1'}) 57 | # for com in compy: 58 | # print(com.string) 59 | 60 | # 薪资 61 | salarys = soup.findAll('p', {'class': 'salary'}) 62 | # for salary in salarys: 63 | # print(salary.string) 64 | # 发布时间跟发布来源网站 65 | addresss = soup.findAll('div', {'class': 'right time'}) 66 | # print(addresss) 67 | reg = r'
    (.*?)
    ' 68 | regx = re.compile(reg) 69 | ads = re.findall(regx, str(addresss)) 70 | # print(ads) 71 | # for adds in ads: 72 | # data = adds.split('|') 73 | # print(data) 74 | for itm_url, job_detail, ver_compny, ver_salary, ver_addres in zip(item_url, jobs, compy, salarys, ads): 75 | data = { 76 | 'itme_url': 'http://zhaopin.baidu.com'+itm_url.get('href'), 77 | 'job_detail': job_detail.string, 78 | 'ver_compny': str(ver_compny.string), 79 | 'ver_salary': ver_salary.string, 80 | 'ver_addres': str(ver_addres).split('|'), 81 | } 82 | print(data) 83 | # 插入数据库 84 | ver_job.insert_one(data) # 插入数据库失败 85 | f.write(str(data)) 86 | 87 | 88 | def get_page_source(page_num): 89 | time.sleep(2) 90 | driver.find_element_by_xpath('//*[@id="pagination"]/p/span/a[%s]' % page_num).click() 91 | # //*[@id="pagination"]/p/span/a[1] 为在第一页的按钮 92 | # //*[@id="pagination"]/p/span/a[2] 为第二页的按钮 93 | set_winscroll(driver) 94 | we_data = driver.page_source 95 | return we_data 96 | 97 | f = open('百度招聘前30页杭州.csv', 'a',encoding='utf-8') 98 | # 首页的数据 99 | def getBaiduHangZhouJob(we_data): 100 | parse_html(we_data) 101 | for i in range(1, 50): 102 | if i==1: 103 | we_data = get_page_source(1) 104 | parse_html(we_data) 105 | elif i<=5: 106 | we_data = get_page_source(str(2)) 107 | parse_html(we_data) 108 | else: 109 | we_data = get_page_source(str(3)) 110 | parse_html(we_data) 111 | f.close() 112 | 113 | 114 | if __name__ == '__main__': 115 | getBaiduHangZhouJob(we_data) 116 | # pool = Pool(processes=10) 117 | # pool.map_async(getBaiduHangZhouJob(we_data)) 118 | # pool.close() 119 | # f.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/16_python爬取熊猫直播用户信息/python爬取熊猫直播用户信息.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import json 5 | import pandas as pd 6 | 7 | url = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=%d&pagenum=120&_=%d".format(a=range(0,35),b=range(1501946526480,1501946526880)) 8 | 9 | headers = { 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0' 11 | , 12 | 'Cookie': '__guid=96554777.3243119502220345300.1500627276199.6702; smid=608e0bde-ffe2-4251-90ca-2938cabdc074; monitor_count=18' 13 | , 14 | } 15 | 16 | 17 | def getHtml(url): 18 | req = requests.get(url, headers=headers) 19 | print(req.text) 20 | return req.text 21 | 22 | 23 | def printInfos(data): 24 | jsondata = json.loads(data, "utf-8") 25 | # print(jsondata) 26 | itemsinfo = jsondata['data']['items'] 27 | items_list = [] 28 | for pinfo in itemsinfo: 29 | name = pinfo['name'] 30 | person_num = pinfo['person_num'] 31 | nickName = pinfo['userinfo']['nickName'] 32 | lelvel = pinfo['host_level_info'] 33 | lable = pinfo['label'] 34 | cname = pinfo['classification'] 35 | item_list = [name, person_num, nickName, lelvel, label, cname] 36 | items_list.append(item_list) 37 | df = pd.DataFrame(items_list, columns = ['name','person_num','nickName','host_level_info','label','classification']) 38 | df.to_csv('熊猫直播用户信息.csv') 39 | 40 | 41 | def mainStart(): 42 | for n in range(0, 3): 43 | pageindex = 1 + n 44 | pagetime = int(1501946526480 + n) 45 | url = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=%d&pagenum=120&_=%d"%(pageindex,pagetime) 46 | data = getHtml(url) 47 | printInfos(data) 48 | 49 | mainStart() -------------------------------------------------------------------------------- 
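Two runtime issues are worth flagging in the 熊猫直播 script above: json.loads(data, "utf-8") passes a positional argument that Python 3's json.loads does not accept, and the loop assigns lable but later reads label, which raises NameError. A minimal corrected sketch of printInfos under those assumptions (imports repeated for completeness; not the author's original code):

import json
import pandas as pd

def printInfos(data):
    jsondata = json.loads(data)                    # Python 3: json.loads takes only the string
    itemsinfo = jsondata['data']['items']
    items_list = []
    for pinfo in itemsinfo:
        items_list.append([
            pinfo['name'],
            pinfo['person_num'],
            pinfo['userinfo']['nickName'],
            pinfo['host_level_info'],
            pinfo['label'],                        # fixed: original assigned `lable` but used `label`
            pinfo['classification'],
        ])
    df = pd.DataFrame(items_list,
                      columns=['name', 'person_num', 'nickName',
                               'host_level_info', 'label', 'classification'])
    df.to_csv('熊猫直播用户信息.csv', index=False)  # note: called once per page, so each call overwrites the file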
/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl youtx -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl youtx -o items.csv 时以csv格式保存下载数据 3 | 4 | 5 | Scrapy必须背下来的命令: 6 | 1 创建项目: scrapy startproject youtxNanJin 7 | startproject: 表示创建项目 8 | youtxNanJin: 表示创建的项目名 9 | 10 | 2 创建爬虫: scrapy genspider youtx "http://www.youtx.com" 11 | genspider: 表示生成一个爬虫(默认是scrapy.Spider类) 12 | youtx: 表示爬虫名(对应爬虫代码里的 name 参数) 13 | "http://www.youtx.com": 表示允许爬虫爬取的域范围 14 | 15 | 3 执行爬虫: scrapy crawl youtx 16 | crawl: 表示启动一个sc rapy爬虫 17 | youtx: 表示需要启动的爬虫名(对应爬虫代码里的 name 参数) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = youtxNanJin.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = youtxNanJin 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class YoutxnanjinItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 房源名称 17 | homeName = scrapy.Field() 18 | # 房源链接 19 | homeLine = scrapy.Field() 20 | # 房租单价 21 | homeSinglePrice = scrapy.Field() 22 | # 房租地址 23 | homeAddress = scrapy.Field() 24 | # 房租近期信息 25 | homeDetai = scrapy.Field() 26 | # 满七天价格 27 | homeSeven = scrapy.Field() 28 | # 满30天价格 29 | homeThirth = scrapy.Field() 30 | 31 | # 房东 32 | homePerson = scrapy.Field() 33 | # 房东头像 34 | homePersonImg = scrapy.Field() 35 | # 房东头像链接 36 | homePersonLink = scrapy.Field() 37 | 38 | # 房子大图 39 | homePicBg = scrapy.Field() 40 | # 房子大图链接 41 | homePicLink = scrapy.Field() 42 | 43 | # 品牌店铺信息 44 | # homePinPai = scrapy.Field() 45 | # 明星房东 46 | # homeStarrPerson = scrapy.Field() 47 | 48 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class YoutxnanjinSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | from scrapy.conf import settings 9 | import pymongo 10 | 11 | 12 | class YoutxnanjinPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | 17 | class YouTXMongo(object): 18 | def __init__(self): 19 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 20 | self.db = self.client[settings['MONGO_DB']] 21 | self.post = self.db[settings['MONGO_COLL']] 22 | 23 | def process_item(self, item, spider): 24 | postItem = dict(item) 25 | self.post.insert(postItem) 26 | return item 27 | 28 | # 写入json文件 29 | class JsonWritePipline(object): 30 | def __init__(self): 31 | self.file = open('游天下南京.json','w',encoding='utf-8') 32 | 33 | def process_item(self,item,spider): 34 | line = json.dumps(dict(item),ensure_ascii=False)+"\n" 35 | self.file.write(line) 36 | return item 37 | 38 | def spider_closed(self,spider): 39 | self.file.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for youtxNanJin project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'youtxNanJin' 13 | 14 | SPIDER_MODULES = ['youtxNanJin.spiders'] 15 | NEWSPIDER_MODULE = 'youtxNanJin.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'youtxNanJin (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # 配置mongoDB 27 | MONGO_HOST = "127.0.0.1" # 主机IP 28 | MONGO_PORT = 27017 # 端口号 29 | MONGO_DB = "YouTianXia" # 库名 30 | MONGO_COLL = "house_nanjin" # collection 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | #DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | #} 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'youtxNanJin.middlewares.YoutxnanjinSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'youtxNanJin.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | # 'youtxNanJin.pipelines.YoutxnanjinPipeline': 300, 77 | 'youtxNanJin.pipelines.YouTXMongo': 300, 78 | 'youtxNanJin.pipelines.JsonWritePipline': 300, 79 | } 80 | 81 | # Enable and configure the AutoThrottle extension (disabled by default) 82 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 83 | #AUTOTHROTTLE_ENABLED = True 84 | # The initial download delay 85 | #AUTOTHROTTLE_START_DELAY = 5 86 | # The maximum download delay to be set in case of high latencies 87 | #AUTOTHROTTLE_MAX_DELAY = 60 88 | # The average number of requests Scrapy should be sending in parallel to 89 | # each remote server 90 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 91 | # Enable showing throttling stats for every response received: 92 | #AUTOTHROTTLE_DEBUG = False 93 | 94 | # Enable and configure HTTP caching (disabled by default) 95 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 96 | #HTTPCACHE_ENABLED = True 97 | #HTTPCACHE_EXPIRATION_SECS = 0 98 | #HTTPCACHE_DIR = 'httpcache' 99 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 100 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 101 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/youtxNanJin_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/youtxNanJin_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/youtxNanJin_spider.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import scrapy 3 | from youtxNanJin.items import YoutxnanjinItem 4 | 5 | class NanJinDefault(scrapy.Spider): 6 | name = 'youtx' 7 | allowed_domains = ['youtx.com'] 8 | start_urls = ["http://www.youtx.com/nanjing/longrent1-page{}".format(n) for n in range(0,6)] 9 | def parse(self, response): 10 | # print(response.body) 11 | node_list = response.xpath("//div[@class='duanzu houseList']/ul/li[@class='clearfix']") 12 | # print(node_list) 13 | for node in node_list: 14 | item = YoutxnanjinItem() 15 | homeName = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/text()").extract() 16 | homeLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/@href").extract() 17 | print(homeName) 18 | print(homeLink) 19 | 20 | # 单日价格 21 | homeSinglePrice = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/span/span[@class='housePrice']/text()").extract() 22 | print(homeSinglePrice) 23 | 24 | # 获取房源地址 25 | homeAddress = node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='clearfix mt5']/text()").extract() 26 | # 房租信息 27 | homeDesc =node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='mt5']/text()").extract() 28 | homeDesc2 =node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left 
mt2']/p[@class='mt5']/span[2]/text()").extract() 29 | print(homeAddress) 30 | print(homeDesc) 31 | print(homeDesc2) 32 | 33 | # 满30天的信息 34 | homeThrty = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/div[@class='mix12_5']/div[@class='discount']/div[@class='discount-price']/span//text()").extract() 35 | print(homeThrty) 36 | # 房东信息 37 | homePerson = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/text()").extract() 38 | # 房东链接 39 | homePersonLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/@href").extract() 40 | print(homePerson) 41 | print(homePersonLink) 42 | 43 | # 房源大图图片 44 | homeBigPic = node.xpath("./div[@class='house-img']/a[1]/img/@src").extract() 45 | homeBigPicLink = node.xpath("./div[@class='house-img']/a[1]/@href").extract() 46 | print(homeBigPic) 47 | print(homeBigPicLink) 48 | # 房东头像信息 49 | personPic = node.xpath("./div[@class='house-img']/a[2]/img/@src").extract() 50 | # 房东头像链接地址 51 | personPicLink = node.xpath("./div[@class='house-img']/a[2]/img/@href").extract() 52 | 53 | print(personPic) 54 | print(homePersonLink) 55 | item['homeName'] ="".join(homeName) 56 | item['homeLine'] ="".join(homeLink) 57 | item['homeSinglePrice'] ="".join(homeSinglePrice) 58 | item['homeAddress'] ="".join(homeAddress) 59 | item['homeDetai'] ="".join(homeDesc)+"".join(homeDesc2) 60 | # 这里的值暂时没有取出来 61 | item['homeSeven'] ="".join(homeThrty) 62 | item['homeThirth'] ="".join(homeThrty) 63 | 64 | item['homePerson'] ="".join(homePerson) 65 | item['homePersonImg'] ="".join(personPic) 66 | item['homePersonLink'] ="".join(homePersonLink) 67 | item['homePicBg'] ="".join(homeBigPic) 68 | item['homePicLink'] ="".join(homeBigPicLink) 69 | yield item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl docNet -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl docNet -o items.csv 时以csv格式保存下载数据 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ChinadoctornetItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | 15 | # 爬取中国医学人才网的条目(共5个条目) 16 | # 医院名称 17 | hospitalName = scrapy.Field() 18 | # 医院规模 19 | hospitalSize = scrapy.Field() 20 | # 医院所在地 21 | hospitalAddress = scrapy.Field() 22 | # 医院科目 23 | hospitalDesc = scrapy.Field() 24 | # pass 25 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ChinadoctornetSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 
35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | # import json 8 | 9 | class ChinadoctornetPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | 13 | 14 | # class JsonWriterPipeline(object): 15 | # def __init__(self): 16 | # self.file = open('中国医学人才网招聘最新招聘专栏2.json', 'w', encoding='utf-8') 17 | 18 | # def process_item(self, item, spider): 19 | # line = json.dumps(dict(item), ensure_ascii=False) + "\n" 20 | # self.file.write(line) 21 | # return item 22 | 23 | # def spider_closed(self, spider): 24 | # self.file.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for chinadoctornet project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'chinadoctornet' 13 | 14 | SPIDER_MODULES = ['chinadoctornet.spiders'] 15 | NEWSPIDER_MODULE = 'chinadoctornet.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'chinadoctornet (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'chinadoctornet.middlewares.ChinadoctornetSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'chinadoctornet.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | # ITEM_PIPELINES = { 68 | # # 'chinadoctornet.pipelines.ChinadoctornetPipeline': 300, 69 | # 'chinadoctornet.pipelines.JsonWritePipline': 300, 70 | # } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
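One detail worth flagging in the chinadoctornet project above: the commented-out ITEM_PIPELINES entry refers to 'chinadoctornet.pipelines.JsonWritePipline', while pipelines.py defines (also commented out) JsonWriterPipeline. The README's `scrapy crawl docNet -o items.json` works without any pipeline, but if the pipeline route were re-enabled, the registered dotted path must match the class name exactly. A hedged sketch of the re-enabled form:

# settings.py (hypothetical re-enabled form; class name must match pipelines.py)
ITEM_PIPELINES = {
    'chinadoctornet.pipelines.JsonWriterPipeline': 300,
}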
-------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/chinadoctornet_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/chinadoctornet_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/chinadoctornet_spider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import scrapy 3 | from chinadoctornet.items import ChinadoctornetItem 4 | 5 | 6 | class ChinaDocNet(scrapy.Spider): 7 | # 启动爬虫的名称 8 | name = 'docNet' 9 | # 爬取域名的范围 10 | allowed_domains = ['yixuezp.com'] 11 | # 爬虫第一个url地址 12 | start_urls = ['http://www.yixuezp.com/zhaopin?page={}'.format(n) for n in range(0, 464)] # 463 13 | 14 | def parse(self, response): 15 | # 医院name 16 | node_list = response.xpath("//div[@class='newsjob']/ul/li") 17 | items = [] 18 | for node in node_list: 19 | item = ChinadoctornetItem() 20 | hospitalName = node.xpath("./a/text()").extract() 21 | hospitalSize = node.xpath("./span[1]/text()").extract() 22 | hospitalAddress = node.xpath("./span[2]/text()").extract() 23 | hospitalDesc = node.xpath("./p/a/text()").extract() 24 | 25 | item['hospitalName'] = hospitalName 26 | item['hospitalSize'] = hospitalSize 27 | item['hospitalAddress'] = hospitalAddress 28 | item['hospitalDesc'] = hospitalDesc 29 | items.append(item) 30 | # return items # 如果直接return的话,一页数据只会返回一条数据 31 | yield item #用yield 的话,可以交给下载器,继续执行下一步操作。 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = chinadoctornet.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = chinadoctornet 12 | -------------------------------------------------------------------------------- 
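The closing comment in chinadoctornet_spider.py explains why parse() yields each item instead of returning: a return inside the loop ends the callback after the first entry, while yield hands every item to the engine and keeps iterating. A simplified, self-contained illustration of that point (generic functions, not part of the project):

def collect_with_return(nodes):
    for node in nodes:
        return [node]          # function ends here: only the first element is produced

def collect_with_yield(nodes):
    for node in nodes:
        yield node             # a generator: every element is produced in turn

print(collect_with_return([1, 2, 3]))        # [1]
print(list(collect_with_yield([1, 2, 3])))   # [1, 2, 3]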
/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/中国医学人才网招聘最新招聘专栏.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/中国医学人才网招聘最新招聘专栏.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl doubanMovie -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl doubanMovie -o items.csv 时以csv格式保存下载数据 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanmovieItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 电影名字 17 | name = scrapy.Field() 18 | # 电影信息 19 | info = scrapy.Field() 20 | # 评分 21 | rating = scrapy.Field() 22 | # 评论人数 23 | num = scrapy.Field() 24 | # 经典语句 25 | quote = scrapy.Field() 26 | # 电影图片 27 | img_url = scrapy.Field() 28 | 
-------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DoubanmovieSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanmoviePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for doubanmovie project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'doubanmovie' 13 | 14 | SPIDER_MODULES = ['doubanmovie.spiders'] 15 | NEWSPIDER_MODULE = 'doubanmovie.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'doubanmovie (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'doubanmovie.middlewares.DoubanmovieSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'doubanmovie.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'doubanmovie.pipelines.DoubanmoviePipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
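A small optional addition to the doubanmovie settings above: when exporting with the README's `scrapy crawl doubanMovie -o items.json`, Scrapy's JSON exporter escapes non-ASCII characters by default, so the Chinese titles and quotes come out as \uXXXX sequences. Assuming Scrapy 1.2 or later, one extra line in settings.py keeps them readable:

# settings.py (assumed addition, not in the original project)
FEED_EXPORT_ENCODING = 'utf-8'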
-------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/doubanmovie_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/doubanmovie_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/doubanmovie_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from doubanmovie.items import DoubanmovieItem 3 | 4 | class Movie(scrapy.Spider): 5 | # 爬虫唯一标识符 6 | name = 'doubanMovie' 7 | # 爬取域名 8 | allowed_domains = ['movie.douban.com'] 9 | # 爬取页面地址 10 | start_urls = ['https://movie.douban.com/top250'] 11 | 12 | def parse(self, response): 13 | selector = scrapy.Selector(response) 14 | # 解析出各个电影 15 | movies = selector.xpath('//div[@class="item"]') 16 | 17 | for movie in movies: 18 | # 存放电影信息 19 | item = DoubanmovieItem() 20 | 21 | # 电影各种语言名字的列表 22 | titles = movie.xpath('.//span[@class="title"]/text()').extract() 23 | # 将中文名与英文名合成一个字符串 24 | name = '' 25 | for title in titles: 26 | name += title.strip() 27 | item['name'] = name 28 | 29 | # 电影信息列表 30 | infos = movie.xpath('.//div[@class="bd"]/p/text()').extract() 31 | # 电影信息合成一个字符串 32 | fullInfo = '' 33 | for info in infos: 34 | fullInfo += info.strip() 35 | item['info'] = fullInfo 36 | # 提取评分信息 37 | item['rating'] = movie.xpath('.//span[@class="rating_num"]/text()').extract()[0].strip() 38 | # 提取评价人数 39 | item['num'] = movie.xpath('.//div[@class="star"]/span[last()]/text()').extract()[0].strip()[:-3] 40 | # 提取经典语句,quote可能为空 41 | quote = movie.xpath('.//span[@class="inq"]/text()').extract() 42 | if quote: 43 | quote = quote[0].strip() 44 | item['quote'] = quote 45 | # 提取电影图片 46 | item['img_url'] = movie.xpath('.//img/@src').extract()[0] 47 | 48 | yield item 49 | 50 | next_page = selector.xpath('//span[@class="next"]/a/@href').extract_first() 51 | if next_page: 52 | url = 'https://movie.douban.com/top250' + next_page 53 | yield scrapy.Request(url, callback=self.parse) 54 | 55 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/scrapy.cfg:
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = doubanmovie.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = doubanmovie 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MakedreamItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 文章标题 17 | articleTitle = scrapy.Field() 18 | # 文章标题url 19 | articleUrl = scrapy.Field() 20 | # 文章描述 21 | articleDesc = 
scrapy.Field() 22 | # 文章发布时间 23 | articlePublic = scrapy.Field() 24 | # 文章类型 25 | articleType = scrapy.Field() 26 | # 文章标签 27 | articleTag = scrapy.Field() 28 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MakedreamSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | import pymongo 9 | from scrapy.conf import settings 10 | 11 | class MakedreamPipeline(object): 12 | def process_item(self, item, spider): 13 | return item 14 | 15 | 16 | class DreamMongo(object): 17 | def __init__(self): 18 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 19 | self.db = self.client[settings['MONGO_DB']] 20 | self.post = self.db[settings['MONGO_COLL']] 21 | 22 | def process_item(self, item, spider): 23 | postItem = dict(item) 24 | self.post.insert(postItem) 25 | return item 26 | 27 | 28 | # 写入json文件类 29 | class JsonWritePipeline(object): 30 | def __init__(self): 31 | self.file = open('织梦网其他编程.json', 'w', encoding='utf-8') 32 | 33 | def process_item(self, item, spider): 34 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 35 | self.file.write(line) 36 | return item 37 | 38 | def close_spider(self, spider): 39 | self.file.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for makedream project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'makedream' 13 | 14 | SPIDER_MODULES = ['makedream.spiders'] 15 | NEWSPIDER_MODULE = 'makedream.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'makedream (+http://www.yourdomain.com)' 20 | # 配置mongoDB 21 | MONGO_HOST = "127.0.0.1" # 主机IP 22 | MONGO_PORT = 27017 # 端口号 23 | MONGO_DB = "DreamDB" # 库名 24 | MONGO_COLL = "Dream_info" # collection 25 | 26 | 27 | 28 | # Obey robots.txt rules 29 | ROBOTSTXT_OBEY = False 30 | 31 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 32 | #CONCURRENT_REQUESTS = 32 33 | 34 | # Configure a delay for requests for the same website (default: 0) 35 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 36 | # See also autothrottle settings and docs 37 | #DOWNLOAD_DELAY = 3 38 | # The download delay setting will honor only one of: 39 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 40 | #CONCURRENT_REQUESTS_PER_IP = 16 41 | 42 | # Disable cookies (enabled by default) 43 | # COOKIES_ENABLED = False 44 | 45 | # Disable Telnet Console (enabled by default) 46 | #TELNETCONSOLE_ENABLED = False 47 | 48 | # Override the default request headers: 49 | #DEFAULT_REQUEST_HEADERS = { 50 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 51 | # 'Accept-Language': 'en', 52 | #} 53 | 54 | # Enable or disable spider middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 56 | #SPIDER_MIDDLEWARES = { 57 | # 'makedream.middlewares.MakedreamSpiderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable downloader middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | #DOWNLOADER_MIDDLEWARES = { 63 | # 'makedream.middlewares.MyCustomDownloaderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | ITEM_PIPELINES = { 75 | # 'makedream.pipelines.MakedreamPipeline': 300, 76 | 'makedream.pipelines.JsonWritePipeline':300, 77 | 'makedream.pipelines.DreamMongo':300 78 | } 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 
99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/makedream_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/makedream_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/makedream_spider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import scrapy 3 | from makedream.items import MakedreamItem 4 | 5 | 6 | class DramingNet(scrapy.Spider): 7 | # 启动爬虫的名称 8 | name = 'dreaming' 9 | # 爬虫的域范围 10 | allowed_domains = ['zhimengzhe.com'] 11 | # 爬虫的第一个url 12 | start_urls = ['http://www.zhimengzhe.com/bianchengjiaocheng/qitabiancheng/index_{}.html'.format(n) for n in 13 | range(0, 1466)] 14 | 15 | # 爬取结果解析 16 | def parse(self, response): 17 | base_url = 'http://www.zhimengzhe.com' 18 | # print(response.body) 19 | node_list = response.xpath("//ul[@class='list-unstyled list-article']/li") 20 | for node in node_list: 21 | item = MakedreamItem() 22 | nextNode = node.xpath("./div[@class='pull-left ltxt w658']") 23 | print('*' * 30) 24 | title = nextNode.xpath('./h3/a/text()').extract() 25 | link = nextNode.xpath('./h3/a/@href').extract() 26 | desc = nextNode.xpath('./p/text()').extract() 27 | 28 | # 创建时间,类型,标签 29 | publicTime = nextNode.xpath("./div[@class='tagtime']/span[1]/text()").extract() 30 | publicType = nextNode.xpath("./div[@class='tagtime']/span[2]/a/text()").extract() 31 | publicTag = nextNode.xpath("./div[@class='tagtime']/span[3]/a/text()").extract() 32 | # node 33 | titleLink = base_url + ''.join(link) 34 | item['articleTitle'] = title 35 | # 文章标题url 36 | item['articleUrl'] = titleLink 37 | # 文章描述 38 | item['articleDesc'] = desc 39 | # 文章发布时间 40 | item['articlePublic'] = publicTime 41 | # 文章类型 42 | item['articleType'] = publicType 43 | # 文章标签 44 | item['articleTag'] = publicTag 45 | yield item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = makedream.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = makedream 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ComentsAnaylst.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2018/4/25 11:15 3 | # @File : commentsAnaylst.py(再见前任3的影评f词云) 4 | 5 | import matplotlib.pyplot as plt 6 | from PIL import Image 7 | from wordcloud import WordCloud 8 | import jieba 9 | import numpy as np 10 | #读取txt格式的文本内容 11 | text_from_file_with_apath = open('douban.txt','rb').read() 12 | 13 | #使用jieba进行分词,并对分词的结果以空格隔开 14 | wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all = True) 15 | wl_space_split = " ".join(wordlist_after_jieba) 16 | 17 | #对分词后的文本生成词云 18 | # my_wordcloud = WordCloud().generate(wl_space_split) 19 | 20 | font = r'C:\Windows\Fonts\simfang.ttf' 21 | mask = np.array(Image.open('ciyun.jpg')) 22 | wc = WordCloud(mask=mask,max_words=3000,collocations=False, font_path=font, width=5800, height=2400, margin=10,background_color='black').generate(wl_space_split) 23 | default_colors = wc.to_array() 24 | plt.title("QR 3") 25 | plt.imshow(wc) 26 | plt.axis("off") 27 | plt.savefig("ciyun.png") 28 | plt.show() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.jpg -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.png -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/douban_qianren3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2018/4/25 11:15 3 | # @File : test_douban_qianren3.py(再见前任3的影评) 4 | 5 | import csv 6 | import requests 7 | from lxml import etree 8 | import time 9 | 10 | 11 | url = 'https://movie.douban.com/subject/26662193/comments?start=0&limit=20&sort=new_score&status=P&percent_type=' 12 | 13 | headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36', 15 | 'Cookie': 'gr_user_id=ffdf2f63-ec37-49b5-99e8-0e0d28741172; bid=qh9RXgIGopg; viewed="26826540_24703171"; ap=1; ll="118172"; ct=y; _vwo_uuid_v2=8C5B24903B1D1D3886FE478B91C5DE97|7eac18658e7fecbbf3798b88cfcf6113; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1522129522%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DdnHqCRiT1HlhToCp0h1cpdyV8rB9f_OfOvJhjRPO3p1jrl764LGvi7gbYSdskDMh%26wd%3D%26eqid%3De15db1bb0000e3cd000000045ab9b6fe%22%5D; 
_pk_id.100001.4cf6=4e61f4192b9486a8.1485672092.10.1522130672.1522120744.; _pk_ses.100001.4cf6=*'} 16 | 17 | 18 | def get_html(current_url): 19 | time.sleep(2) 20 | r = requests.get(current_url, headers=headers) 21 | r.raise_for_status() 22 | return etree.HTML(r.text) 23 | 24 | 25 | def parse_html(content,writer): 26 | links = content.xpath("//*[@class='comment-item']") 27 | for link in links: 28 | content = link.xpath("./div[@class='comment']/p/text()")[0].strip() 29 | author = link.xpath("./div[@class='comment']/h3/span[@class='comment-info']/a/text()")[0].strip() 30 | time = link.xpath("./div[@class='comment']/h3/span[@class='comment-info']/span[@class='comment-time ']/text()")[ 31 | 0].strip() 32 | is_useful = link.xpath("./div[@class='comment']/h3/span[@class='comment-vote']/span[@class='votes']/text()")[0] 33 | print('content:', content) 34 | print('time:', time) 35 | print('is_useful:', is_useful) 36 | # detail = (author, time, is_useful, content) 37 | detail = (is_useful,content) 38 | writer.writerow(detail) 39 | 40 | 41 | if __name__ == '__main__': 42 | with open('douban.txt', 'a+', encoding='utf-8', newline='') as csvf: 43 | writer = csv.writer(csvf) 44 | writer.writerow(('作者', '时间', '有用数', '内容')) 45 | for page in range(0, 260, 20): 46 | url = 'https://movie.douban.com/subject/26662193/comments?start={}&limit=20&sort=new_score&status=P&percent_type='.format( 47 | page) 48 | r = get_html(url) 49 | parse_html(r,writer) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/22_python爬取Bilibili用户信息并导入mysql数据库/bilibili_user.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf8-*- 2 | 3 | import requests 4 | import json 5 | import random 6 | import pymysql 7 | import sys 8 | import datetime 9 | import time 10 | from imp import reload 11 | from multiprocessing.dummy import Pool as ThreadPool 12 | 13 | 14 | def datetime_to_timestamp_in_milliseconds(d): 15 | def current_milli_time(): return int(round(time.time() * 1000)) 16 | 17 | return current_milli_time() 18 | 19 | 20 | reload(sys) 21 | 22 | 23 | def LoadUserAgents(uafile): 24 | """ 25 | uafile : string 26 | path to text file of user agents, one per line 27 | """ 28 | uas = [] 29 | with open(uafile, 'rb') as uaf: 30 | for ua in uaf.readlines(): 31 | if ua: 32 | uas.append(ua.strip()[1:-1 - 1]) 33 | random.shuffle(uas) 34 | return uas 35 | 36 | 37 | uas = LoadUserAgents("user_agents.txt") 38 | head = { 39 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', 40 | 'X-Requested-With': 'XMLHttpRequest', 41 | 'Referer': 'http://space.bilibili.com/45388', 42 | 'Origin': 'http://space.bilibili.com', 43 | 'Host': 'space.bilibili.com', 44 | 'AlexaToolbar-ALX_NS_PH': 'AlexaToolbar/alx-4.0', 45 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4', 46 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 47 | } 48 | proxies = { 49 | 'http': 'http://61.155.164.108:3128', 50 | 'http': 'http://116.199.115.79:80', 51 | 'http': 'http://42.245.252.35:80', 52 | 'http': 'http://106.14.51.145:8118', 53 | 'http': 'http://116.199.115.78:80', 54 | 'http': 'http://123.147.165.143:8080', 55 | 'http': 'http://58.62.86.216:9999', 56 | 'http': 'http://202.201.3.121:3128', 57 | 'http': 'http://119.29.201.134:808', 58 | 'http': 'http://61.155.164.112:3128', 59 | 'http': 'http://123.57.76.102:80', 60 | 'http': 'http://116.199.115.78:80', 61 | } 62 | time1 = time.time() 63 
| 64 | for m in range(99, 101): # 26 ,1000 65 | urls = [] 66 | for i in range(m * 100, (m + 1) * 100): 67 | url = 'https://space.bilibili.com/' + str(i) 68 | urls.append(url) 69 | 70 | 71 | def getsource(url): 72 | payload = { 73 | '_': datetime_to_timestamp_in_milliseconds(datetime.datetime.now()), 74 | 'mid': url.replace('https://space.bilibili.com/', '') 75 | } 76 | ua = random.choice(uas) 77 | head = { 78 | 'User-Agent': ua, 79 | 'Referer': 'https://space.bilibili.com/' + str(i) + '?from=search&seid=' + str(random.randint(10000, 50000)) 80 | } 81 | jscontent = requests \ 82 | .session() \ 83 | .post('http://space.bilibili.com/ajax/member/GetInfo', 84 | headers=head, 85 | data=payload, 86 | proxies=proxies) \ 87 | .text 88 | time2 = time.time() 89 | try: 90 | jsDict = json.loads(jscontent) 91 | statusJson = jsDict['status'] if 'status' in jsDict.keys() else False 92 | if statusJson == True: 93 | if 'data' in jsDict.keys(): 94 | jsData = jsDict['data'] 95 | mid = jsData['mid'] 96 | name = jsData['name'] 97 | sex = jsData['sex'] 98 | face = jsData['face'] 99 | coins = jsData['coins'] 100 | spacesta = jsData['spacesta'] 101 | birthday = jsData['birthday'] if 'birthday' in jsData.keys() else 'nobirthday' 102 | place = jsData['place'] if 'place' in jsData.keys() else 'noplace' 103 | description = jsData['description'] 104 | article = jsData['article'] 105 | playnum = jsData['playNum'] 106 | sign = jsData['sign'] 107 | level = jsData['level_info']['current_level'] 108 | exp = jsData['level_info']['current_exp'] 109 | print("Succeed: " + mid + "\t" + str(time2 - time1)) 110 | try: 111 | res = requests.get( 112 | 'https://api.bilibili.com/x/space/navnum?mid=' + str(mid) + '&jsonp=jsonp').text 113 | js_fans_data = json.loads(res) 114 | following = js_fans_data['data']['following'] 115 | fans = js_fans_data['data']['follower'] 116 | except: 117 | following = 0 118 | fans = 0 119 | else: 120 | print('no data now') 121 | try: 122 | conn = pymysql.connect( 123 | host='127.0.0.1', port=3306, user='root', passwd='******', db='sunshine',charset="utf8") 124 | cur = conn.cursor() 125 | cur.execute('INSERT INTO bilibili_user_info(mid, name, sex, face, coins, spacesta, \ 126 | birthday, place, description, article, following, fans, playnum, sign, level, exp) \ 127 | VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")' 128 | % ( 129 | mid, name, sex, face, coins, spacesta, 130 | birthday, place, description, article, 131 | following, fans, playnum, sign, level, exp 132 | )) 133 | conn.commit() 134 | except Exception: 135 | print("MySQL Error") 136 | else: 137 | print("Error: " + url) 138 | except ValueError: 139 | pass 140 | 141 | 142 | pool = ThreadPool(1) 143 | try: 144 | results = pool.map(getsource, urls) 145 | except Exception: 146 | print('ConnectionError') 147 | pool.close() 148 | pool.join() 149 | time.sleep(11) 150 | pool = ThreadPool(1) 151 | results = pool.map(getsource, urls) 152 | 153 | time.sleep(30) 154 | 155 | pool.close() 156 | pool.join() 157 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/22_python爬取Bilibili用户信息并导入mysql数据库/bilibili_user_info.sql: -------------------------------------------------------------------------------- 1 | # ************************************************************ 2 | # Sequel Pro SQL dump 3 | # Version 4135 4 | # 5 | # http://www.sequelpro.com/ 6 | # http://code.google.com/p/sequel-pro/ 7 | # 8 | # Host: 127.0.0.1 (MySQL 5.1.63) 9 | # Database: sunshine 10 | # 
Generation Time: 2018-04-26 13:33:32 +0000 11 | # ************************************************************ 12 | 13 | 14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 17 | /*!40101 SET NAMES utf8 */; 18 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 19 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 20 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 21 | 22 | 23 | # Dump of table bilibili_user_info 24 | # ------------------------------------------------------------ 25 | 26 | CREATE TABLE `bilibili_user_info` ( 27 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 28 | `mid` varchar(11) DEFAULT NULL, 29 | `name` varchar(45) DEFAULT NULL, 30 | `sex` varchar(11) DEFAULT NULL, 31 | `face` varchar(200) DEFAULT NULL, 32 | `coins` int(11) DEFAULT NULL, 33 | `spacesta` int(11) DEFAULT NULL, 34 | `birthday` varchar(45) DEFAULT NULL, 35 | `place` varchar(45) DEFAULT NULL, 36 | `description` varchar(45) DEFAULT NULL, 37 | `article` int(11) DEFAULT NULL, 38 | `following` int(11) DEFAULT NULL, 39 | `fans` int(11) DEFAULT NULL, 40 | `playnum` int(30) DEFAULT NULL, 41 | `sign` varchar(300) DEFAULT NULL, 42 | `level` int(11) DEFAULT NULL, 43 | `exp` int(11) DEFAULT NULL, 44 | PRIMARY KEY (`id`) 45 | ) ENGINE=MyISAM DEFAULT CHARSET=utf8; 46 | 47 | 48 | 49 | 50 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 51 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 52 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 53 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 54 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 55 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 56 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/README.md: -------------------------------------------------------------------------------- 1 | #### 这是一个爬取网易云音乐的所有的歌曲的评论数的爬虫。 2 | 3 | 以下为主要思路: 4 | 5 | - 1. 爬取所有的歌手信息([artists.py]); 6 | - 2. 根据上一步爬取到的歌手信息去爬取所有的专辑信息([album_by_artist.py]); 7 | - 3. 根据专辑信息爬取所有的歌曲信息([music_by_album.py]); 8 | - 4. 根据歌曲信息爬取其评论条数([comments_by_music.py]) 9 | - 5. 
数据库相关的语句都存放于([sql.py])中。 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/album_by_artist.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据上一步获取的歌手的 ID 来用于获取所有的专辑 ID 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import time 7 | from music_163 import sql 8 | 9 | 10 | class Album(object): 11 | headers = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 15 | 'Cache-Control': 'no-cache', 16 | 'Connection': 'keep-alive', 17 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; _ga=GA1.2.1405085820.1476521280; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; JSESSIONID-WYYY=189f31767098c3bd9d03d9b968c065daf43cbd4c1596732e4dcb471beafe2bf0605b85e969f92600064a977e0b64a24f0af7894ca898b696bd58ad5f39c8fce821ec2f81f826ea967215de4d10469e9bd672e75d25f116a9d309d360582a79620b250625859bc039161c78ab125a1e9bf5d291f6d4e4da30574ccd6bbab70b710e3f358f%3A1476594130342; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476588849.1476592408.6; __utmb=94650624.11.10.1476592408; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 18 | 'DNT': '1', 19 | 'Host': 'music.163.com', 20 | 'Pragma': 'no-cache', 21 | 'Referer': 'http://music.163.com/', 22 | 'Upgrade-Insecure-Requests': '1', 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 24 | } 25 | 26 | def save_albums(self, artist_id): 27 | params = {'id': artist_id, 'limit': '200'} 28 | # 获取歌手个人主页 29 | r = requests.get('http://music.163.com/artist/album', headers=self.headers, params=params) 30 | 31 | # 网页解析 32 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 33 | body = soup.body 34 | 35 | albums = body.find_all('a', attrs={'class': 'tit f-thide s-fc0'}) # 获取所有专辑 36 | 37 | for album in albums: 38 | albume_id = album['href'].replace('/album?id=', '') 39 | sql.insert_album(albume_id, artist_id) 40 | 41 | 42 | if __name__ == '__main__': 43 | artists = sql.get_all_artist() 44 | my_album = Album() 45 | for i in artists: 46 | try: 47 | my_album.save_albums(i['ARTIST_ID']) 48 | # print(i) 49 | except Exception as e: 50 | # 打印错误日志 51 | print(str(i) + ': ' + str(e)) 52 | time.sleep(5) 53 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/artists.py: -------------------------------------------------------------------------------- 1 | """ 2 | 获取所有的歌手信息 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from music_163 import sql 7 | 8 | headers = { 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 10 | 'Accept-Encoding': 'gzip, deflate, sdch', 11 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 12 | 'Cache-Control': 'no-cache', 13 | 'Connection': 'keep-alive', 14 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; 
_ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; NTES_SESS=Fa2uk.YZsGoj59AgD6tRjTXGaJ8_1_4YvGfXUkS7C1NwtMe.tG1Vzr255TXM6yj2mKqTZzqFtoEKQrgewi9ZK60ylIqq5puaG6QIaNQ7EK5MTcRgHLOhqttDHfaI_vsBzB4bibfamzx1.fhlpqZh_FcnXUYQFw5F5KIBUmGJg7xdasvGf_EgfICWV; S_INFO=1476597594|1|0&80##|hourui93; NETEASE_AUTH_SOURCE=space; NETEASE_AUTH_USERNAME=hourui93; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=cbd082d2ce2cffbcd5c085d8bf565a95aee3173ddbbb00bfa270950f93f1d8bb4cb55a56a4049fa8c828373f630c78f4a43d6c3d252c4c44f44b098a9434a7d8fc110670a6e1e9af992c78092936b1e19351435ecff76a181993780035547fa5241a5afb96e8c665182d0d5b911663281967d675ff2658015887a94b3ee1575fa1956a5a%3A1476607977016; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476595468.1476606177.8; __utmb=94650624.20.10.1476606177; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 15 | 'DNT': '1', 16 | 'Host': 'music.163.com', 17 | 'Pragma': 'no-cache', 18 | 'Referer': 'http://music.163.com/', 19 | 'Upgrade-Insecure-Requests': '1', 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 21 | } 22 | 23 | 24 | def save_artist(group_id, initial): 25 | params = {'id': group_id, 'initial': initial} 26 | r = requests.get('http://music.163.com/discover/artist/cat', params=params) 27 | 28 | # 网页解析 29 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 30 | body = soup.body 31 | 32 | hot_artists = body.find_all('a', attrs={'class': 'msk'}) 33 | artists = body.find_all('a', attrs={'class': 'nm nm-icn f-thide s-fc0'}) 34 | 35 | for artist in hot_artists: 36 | artist_id = artist['href'].replace('/artist?id=', '').strip() 37 | artist_name = artist['title'].replace('的音乐', '') 38 | try: 39 | sql.insert_artist(artist_id, artist_name) 40 | except Exception as e: 41 | # 打印错误日志 42 | print(e) 43 | 44 | for artist in artists: 45 | artist_id = artist['href'].replace('/artist?id=', '').strip() 46 | artist_name = artist['title'].replace('的音乐', '') 47 | try: 48 | sql.insert_artist(artist_id, artist_name) 49 | except Exception as e: 50 | # 打印错误日志 51 | print(e) 52 | 53 | 54 | gg = 4003 55 | 56 | save_artist(gg, 0) 57 | for i in range(65, 91): 58 | save_artist(gg, i) 59 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/comments_by_music.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据歌曲 ID 获得所有的歌曲所对应的评论信息 3 | """ 4 | 5 | import requests 6 | from music_163 import sql 7 | import time 8 | import threading 9 | import pymysql.cursors 10 | 11 | 12 | class Comments(object): 13 | headers = { 14 | 'Host': 'music.163.com', 15 | 'Connection': 'keep-alive', 16 | 'Content-Length': '484', 17 | 'Cache-Control': 'max-age=0', 18 | 'Origin': 'http://music.163.com', 19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36', 20 | 'Content-Type': 'application/x-www-form-urlencoded', 21 | 'Accept': '*/*', 22 | 'DNT': '1', 23 | 'Accept-Encoding': 'gzip, deflate', 24 | 'Accept-Language': 
'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4', 25 | 'Cookie': 'JSESSIONID-WYYY=b66d89ed74ae9e94ead89b16e475556e763dd34f95e6ca357d06830a210abc7b685e82318b9d1d5b52ac4f4b9a55024c7a34024fddaee852404ed410933db994dcc0e398f61e670bfeea81105cbe098294e39ac566e1d5aa7232df741870ba1fe96e5cede8372ca587275d35c1a5d1b23a11e274a4c249afba03e20fa2dafb7a16eebdf6%3A1476373826753; _iuqxldmzr_=25; _ntes_nnid=7fa73e96706f26f3ada99abba6c4a6b2,1476372027128; _ntes_nuid=7fa73e96706f26f3ada99abba6c4a6b2; __utma=94650624.748605760.1476372027.1476372027.1476372027.1; __utmb=94650624.4.10.1476372027; __utmc=94650624; __utmz=94650624.1476372027.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 26 | } 27 | 28 | params = { 29 | 'csrf_token': '' 30 | } 31 | 32 | data = { 33 | 'params': 'Ak2s0LoP1GRJYqE3XxJUZVYK9uPEXSTttmAS+8uVLnYRoUt/Xgqdrt/13nr6OYhi75QSTlQ9FcZaWElIwE+oz9qXAu87t2DHj6Auu+2yBJDr+arG+irBbjIvKJGfjgBac+kSm2ePwf4rfuHSKVgQu1cYMdqFVnB+ojBsWopHcexbvLylDIMPulPljAWK6MR8', 34 | 'encSecKey': '8c85d1b6f53bfebaf5258d171f3526c06980cbcaf490d759eac82145ee27198297c152dd95e7ea0f08cfb7281588cdab305946e01b9d84f0b49700f9c2eb6eeced8624b16ce378bccd24341b1b5ad3d84ebd707dbbd18a4f01c2a007cd47de32f28ca395c9715afa134ed9ee321caa7f28ec82b94307d75144f6b5b134a9ce1a' 35 | } 36 | 37 | proxies = {'http': 'http://127.0.0.1:10800'} 38 | 39 | def get_comments(self, music_id, flag): 40 | self.headers['Referer'] = 'http://music.163.com/playlist?id=' + str(music_id) 41 | if flag: 42 | r = requests.post('http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + str(music_id), 43 | headers=self.headers, params=self.params, data=self.data, proxies=self.proxies) 44 | else: 45 | r = requests.post('http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + str(music_id), 46 | headers=self.headers, params=self.params, data=self.data) 47 | return r.json() 48 | 49 | 50 | if __name__ == '__main__': 51 | my_comment = Comments() 52 | 53 | 54 | def save_comments(musics, flag, connection0): 55 | for i in musics: 56 | my_music_id = i['MUSIC_ID'] 57 | try: 58 | comments = my_comment.get_comments(my_music_id, flag) 59 | if comments['total'] > 0: 60 | sql.insert_comments(my_music_id, comments['total'], str(comments), connection0) 61 | except Exception as e: 62 | # 打印错误日志 63 | print(my_music_id) 64 | print(e) 65 | time.sleep(5) 66 | 67 | 68 | music_before = sql.get_before_music() 69 | music_after = sql.get_after_music() 70 | 71 | # pymysql 链接不是线程安全的 72 | connection1 = pymysql.connect(host='localhost', 73 | user='root', 74 | password='1234', 75 | db='test', 76 | charset='utf8mb4', 77 | cursorclass=pymysql.cursors.DictCursor) 78 | 79 | connection2 = pymysql.connect(host='localhost', 80 | user='root', 81 | password='1234', 82 | db='test', 83 | charset='utf8mb4', 84 | cursorclass=pymysql.cursors.DictCursor) 85 | 86 | t1 = threading.Thread(target=save_comments, args=(music_before, True, connection1)) 87 | t2 = threading.Thread(target=save_comments, args=(music_after, False, connection2)) 88 | t1.start() 89 | t2.start() 90 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/music_by_album.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据专辑 ID 获取到所有的音乐 ID 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import time 7 | from music_163 import sql 8 | 9 | 10 | class Music(object): 11 | headers = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 15 | 'Cache-Control': 'no-cache', 16 | 'Connection': 'keep-alive', 17 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=fb5288e1c5f667324f1636d020704cab2f27ee915622b114f89027cbf60c38be2af6b9cbef2223c1f2581e3502f11b86efd60891d6f61b6f783c0d55114f8269fa801df7352f5cc4c8259876e563a6bd0212b504a8997723a0593b21d5b3d9076d4fa38c098be68e3c5d36d342e4a8e40c1f73378cec0b5851bd8a628886edbdd23a7093%3A1476623819662; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476610320.1476622020.10; __utmb=94650624.14.10.1476622020; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 18 | 'DNT': '1', 19 | 'Host': 'music.163.com', 20 | 'Pragma': 'no-cache', 21 | 'Referer': 'http://music.163.com/', 22 | 'Upgrade-Insecure-Requests': '1', 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 24 | } 25 | 26 | def save_music(self, album_id): 27 | params = {'id': album_id} 28 | # 获取专辑对应的页面 29 | r = requests.get('http://music.163.com/album', headers=self.headers, params=params) 30 | 31 | # 网页解析 32 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 33 | body = soup.body 34 | 35 | musics = body.find('ul', attrs={'class': 'f-hide'}).find_all('li') # 获取专辑的所有音乐 36 | 37 | for music in musics: 38 | music = music.find('a') 39 | music_id = music['href'].replace('/song?id=', '') 40 | music_name = music.getText() 41 | sql.insert_music(music_id, music_name, album_id) 42 | 43 | 44 | if __name__ == '__main__': 45 | albums = sql.get_all_album() 46 | my_music = Music() 47 | for i in albums: 48 | try: 49 | my_music.save_music(i['ALBUM_ID']) 50 | # print(i) 51 | except Exception as e: 52 | # 打印错误日志 53 | print(str(i) + ': ' + str(e)) 54 | time.sleep(5) 55 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/sql.py: -------------------------------------------------------------------------------- 1 | """ 2 | 一般 Python 用于连接 MySQL 的工具:pymysql 3 | """ 4 | import pymysql.cursors 5 | 6 | connection = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='***', db='sunshine',charset="utf8") 7 | 8 | 9 | # 保存评论 10 | def insert_comments(music_id, comments, detail, connection): 11 | with connection.cursor() as cursor: 12 | sql = "INSERT INTO `comments` (`MUSIC_ID`, `COMMENTS`, `DETAILS`) VALUES (%s, %s, %s)" 13 | cursor.execute(sql, (music_id, comments, detail)) 14 | connection.commit() 15 | 16 | 17 | # 保存音乐 18 | def insert_music(music_id, music_name, album_id): 19 | with connection.cursor() as cursor: 20 | sql = "INSERT INTO `musics` (`MUSIC_ID`, `MUSIC_NAME`, `ALBUM_ID`) VALUES (%s, %s, %s)" 21 | cursor.execute(sql, (music_id, music_name, album_id)) 22 | connection.commit() 23 | 24 | 25 | # 保存专辑 26 | def insert_album(album_id, artist_id): 27 | with connection.cursor() as cursor: 28 | sql = "INSERT INTO `albums` (`ALBUM_ID`, `ARTIST_ID`) VALUES (%s, %s)" 29 | cursor.execute(sql, (album_id, artist_id)) 30 | connection.commit() 
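# Note: the helpers in this module share the single module-level `connection`
# created above and commit after each statement. As the comment in
# comments_by_music.py points out, a pymysql connection is not thread-safe,
# which is why insert_comments() accepts a connection argument from its caller
# instead of relying on this shared one.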
31 | 32 | 33 | # 保存歌手 34 | def insert_artist(artist_id, artist_name): 35 | with connection.cursor() as cursor: 36 | sql = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME`) VALUES (%s, %s)" 37 | cursor.execute(sql, (artist_id, artist_name)) 38 | connection.commit() 39 | 40 | 41 | # 获取所有歌手的 ID 42 | def get_all_artist(): 43 | with connection.cursor() as cursor: 44 | sql = "SELECT `ARTIST_ID` FROM `artists` ORDER BY ARTIST_ID" 45 | cursor.execute(sql, ()) 46 | return cursor.fetchall() 47 | 48 | 49 | # 获取所有专辑的 ID 50 | def get_all_album(): 51 | with connection.cursor() as cursor: 52 | sql = "SELECT `ALBUM_ID` FROM `albums` ORDER BY ALBUM_ID" 53 | cursor.execute(sql, ()) 54 | return cursor.fetchall() 55 | 56 | 57 | # 获取所有音乐的 ID 58 | def get_all_music(): 59 | with connection.cursor() as cursor: 60 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID" 61 | cursor.execute(sql, ()) 62 | return cursor.fetchall() 63 | 64 | 65 | # 获取前一半音乐的 ID 66 | def get_before_music(): 67 | with connection.cursor() as cursor: 68 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 0, 800000" 69 | cursor.execute(sql, ()) 70 | return cursor.fetchall() 71 | 72 | 73 | # 获取后一半音乐的 ID 74 | def get_after_music(): 75 | with connection.cursor() as cursor: 76 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 800000, 1197429" 77 | cursor.execute(sql, ()) 78 | return cursor.fetchall() 79 | 80 | 81 | def dis_connect(): 82 | connection.close() 83 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/ctrip_items.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/ctrip_items.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- 
/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class FindtripItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | site = scrapy.Field() 16 | company = scrapy.Field() 17 | flight_time = scrapy.Field() 18 | airports = scrapy.Field() 19 | passtime = scrapy.Field() 20 | price = scrapy.Field() 21 | 22 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class FindtripSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from findtrip.spiders.washctrip import wash 8 | import pymongo 9 | from scrapy.conf import settings 10 | from scrapy import log 11 | from scrapy.exceptions import DropItem 12 | 13 | class FindtripPipeline(object): 14 | def process_item(self, item, spider): 15 | return item 16 | 17 | 18 | class MongoDBPipeline(object): 19 | def __init__(self): 20 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 21 | self.db = self.client[settings['MONGO_DB']] 22 | self.post = self.db[settings['MONGO_COLL']] 23 | 24 | def process_item(self, item, spider): 25 | if item['site'] == 'Qua': 26 | if item['company']: 27 | item['company'] = wash(item['company']) 28 | if item['flight_time']: 29 | item['flight_time'] = wash(item['flight_time']) 30 | if item['airports']: 31 | item['airports'] = wash(item['airports']) 32 | if item['passtime']: 33 | item['passtime'] = wash(item['passtime']) 34 | if item['price']: 35 | item['price'] = wash(item['price']) 36 | for data in item: 37 | if not item[data]: 38 | raise DropItem("Missing data!") 39 | self.post.insert(dict(item)) 40 | log.msg("Question added to MongoDB database!", 41 | level=log.DEBUG, spider=spider) 42 | elif item['site'] == 'Ctrip': 43 | self.post.insert(dict(item)) 44 | log.msg("Question added to MongoDB database!", 45 | level=log.DEBUG, spider=spider) 46 | 47 | return item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for findtrip project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'findtrip' 13 | 14 | SPIDER_MODULES = ['findtrip.spiders'] 15 | NEWSPIDER_MODULE = 'findtrip.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'findtrip (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # 配置mongoDB 27 | MONGO_HOST = "127.0.0.1" # 主机IP 28 | MONGO_PORT = 27017 # 端口号 29 | MONGO_DB = "FindTrip" # 库名 30 | MONGO_COLL = "qua_findtrip" # collection 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | #DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | #} 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'findtrip.middlewares.FindtripSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'findtrip.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | # 'findtrip.pipelines.FindtripPipeline': 300, 77 | 'findtrip.pipelines.MongoDBPipeline': 300, 78 | } 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS 
= 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/ctrip_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/ctrip_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/qua_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/qua_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/washctrip.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/washctrip.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/ctrip_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from findtrip.items import FindtripItem 3 | 4 | class CtripSpider(scrapy.Spider): 5 | name = 'ctrip' 6 | start_urls = [ 7 | "http://flights.ctrip.com/booking/XMN-BJS-day-1.html?DDate1=2016-04-19" 8 | ] 9 | 10 | def parse(self, response): 11 | sel = scrapy.Selector(response) 12 | fligint_div = "//div[@id='J_flightlist2']/div" 13 | dataList = sel.xpath(fligint_div) 14 | 15 | for index,each in enumerate(dataList): 16 | flight_each = fligint_div+'['+str(index+1)+']' 17 | flight_tr = flight_each+"//tr[@class='J_header_row']" 
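            # The loop above re-selects each flight by building an absolute,
            # index-based XPath for the current row (flight_each) and for its
            # header row (flight_tr); the 'train_flight_tit' check right below
            # is then used to skip train tickets that Ctrip mixes into the
            # flight list.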
18 | istrain = sel.xpath(flight_each + "//div[@class='train_flight_tit']") 19 | 20 | if istrain: 21 | print ("this data is train add") 22 | else: 23 | company = sel.xpath(flight_tr + "//div[@class='info-flight J_flight_no']//text()").extract() 24 | 25 | flight_time_from = sel.xpath(flight_tr + "//td[@class='right']/div[1]//text()").extract() 26 | flight_time_to = sel.xpath(flight_tr + "//td[@class='left']/div[1]//text()").extract() 27 | flight_time = [flight_time_from,flight_time_to] 28 | 29 | airports_from = sel.xpath(flight_tr + "//td[@class='right']/div[2]//text()").extract() 30 | airports_to = sel.xpath(flight_tr + "//td[@class='left']/div[2]//text()").extract() 31 | airports = [airports_from,airports_to] 32 | 33 | price_middle = sel.xpath(flight_tr + "[1]//td[@class='price middle ']/span//text()").extract() 34 | price = sel.xpath(flight_tr + "[1]//td[@class='price ']/span//text()").extract() 35 | if price_middle: 36 | price = price_middle 37 | elif price: 38 | price = price 39 | else: 40 | price = '' 41 | 42 | item = FindtripItem() 43 | item['site'] = 'Ctrip' 44 | item['company'] = company 45 | item['flight_time'] = flight_time 46 | item['airports'] = airports 47 | item['price'] = price 48 | yield item 49 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/qua_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from findtrip.items import FindtripItem 3 | 4 | class QuaSpider(scrapy.Spider): 5 | name = "qua" 6 | start_urls = [ 7 | "http://www.qua.com/flights/PEK-XMN/2016-05-12?m=CNY&from=flight_home" 8 | ] 9 | 10 | def parse(self, response): 11 | sel = scrapy.Selector(response) 12 | dataList = sel.xpath("//div[@class='m-fly-item s-oneway']") 13 | 14 | for index,each in enumerate(dataList): 15 | flight_each = "//div[@id='list-box']/div["+str(index+1)+"]" 16 | detail_span = "//div[@class='fl-detail-nav']/ul/li[1]/span[@class='nav-label']" 17 | f_route_div = "//div[@class='m-fl-info-bd']/div" 18 | 19 | airports = sel.xpath(flight_each + f_route_div + '/p[3]//text()').extract() 20 | company = sel.xpath(flight_each + f_route_div + '/p[1]//text()').extract() 21 | flight_time = sel.xpath(flight_each + f_route_div + '/p[2]//text()').extract() 22 | passtime = sel.xpath(flight_each + f_route_div + '/p[4]//text()').extract() 23 | price = sel.xpath(flight_each + "//div[@class='fl-price-box']//em//text()").extract() 24 | 25 | item = FindtripItem() 26 | item['site'] = 'Qua' 27 | item['company'] = company 28 | item['flight_time'] = flight_time 29 | item['airports'] = airports 30 | item['passtime'] = passtime 31 | item['price'] = price 32 | yield item 33 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/washctrip.py: -------------------------------------------------------------------------------- 1 | def wash(dateList): 2 | dateList = map(lambda x : x.split(), dateList) 3 | cleanList = [] 4 | for each in dateList: 5 | if each: 6 | cleanList.append(each[0]) 7 | return cleanList 8 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.csv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.json -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = findtrip.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = findtrip 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Field, Item 9 | 10 | 11 | class PythonjobsItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #pass 15 | title = Field() 16 | city = Field() 17 | company = Field() 18 | location = Field() 19 | url = Field() 20 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class PythonjobsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class PythonjobsPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for pythonjobs project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'pythonjobs' 13 | 14 | SPIDER_MODULES = ['pythonjobs.spiders'] 15 | NEWSPIDER_MODULE = 'pythonjobs.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'pythonjobs (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'pythonjobs.middlewares.PythonjobsSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'pythonjobs.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | 
ITEM_PIPELINES = { 69 | 'pythonjobs.pipelines.PythonjobsPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/job_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/job_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/job_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from pythonjobs.items import PythonjobsItem 4 | #from bs4 import BeautifulSoup 5 | 6 | class JobspiderSpider(scrapy.Spider): 7 | name = 'jobSpider' 8 | allowed_domains = ['search.51job.com','jobs.51job.com'] 9 | 10 | def start_requests(self): 11 | for i in range(1,20): # Set pages to crawl here. 
12 | url = "http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{0}.html".format(i) 13 | yield scrapy.Request(url) 14 | 15 | def parse(self, response): 16 | for sel in response.css("html body div.dw_wp div#resultList.dw_table div.el p.t1 span a"): 17 | url = sel.re('href="(.*?)"')[0] 18 | yield scrapy.Request(url,callback=self.parse_item) 19 | 20 | def parse_item(self, response): 21 | item = PythonjobsItem() 22 | item['title'] = response.xpath('//div[@class="cn"]/h1/@title').extract()[0] 23 | item['url'] = response.url 24 | item['city'] = response.xpath('//span[@class="lname"]/text()').extract()[0] 25 | item['company'] = response.xpath('//p[@class="cname"]/a/@title').extract()[0] 26 | item['location'] = response.xpath('//p[@class="fp"]/text()').extract()[1].rstrip() 27 | return item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = pythonjobs.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = pythonjobs 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/ghostdriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/ghostdriver.log -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = shuimujob.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = shuimujob 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- 
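The PythonjobsPipeline of project 25 is a plain pass-through, so the PythonJobs.csv listed in the tree is presumably produced with Scrapy's feed export (e.g. scrapy crawl jobSpider -o PythonJobs.csv) rather than by a pipeline. For reference only, a pipeline-based CSV export could look like the sketch below; the class name, output file name and field order are illustrative assumptions, not part of the repository:

# Hypothetical alternative to the feed export; not part of the original project.
from scrapy.exporters import CsvItemExporter

class CsvExportPipeline(object):
    def open_spider(self, spider):
        # CsvItemExporter writes bytes, so the file is opened in binary mode.
        self.file = open('PythonJobs.csv', 'wb')
        self.exporter = CsvItemExporter(
            self.file,
            fields_to_export=['title', 'company', 'city', 'location', 'url'])
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

Enabling it would only require pointing ITEM_PIPELINES at 'pythonjobs.pipelines.CsvExportPipeline' in settings.py.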
/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/platform.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/platform.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ShuimujobItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | title = scrapy.Field() 16 | href = scrapy.Field() 17 | author = scrapy.Field() 18 | time = scrapy.Field() 19 | content = scrapy.Field() 20 | is_dev = scrapy.Field() 21 | is_alg = scrapy.Field() 22 | is_fin = scrapy.Field() 23 | base_url_index = scrapy.Field() 24 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ShuimujobSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from scrapy.conf import settings 9 | from scrapy.exceptions import DropItem 10 | from scrapy import log 11 | 12 | class ShuimujobPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | class MongoDBPipeline(object): 17 | 18 | def __init__(self): 19 | pass 20 | 21 | 22 | def open_spider(self, spider): 23 | self.client = pymongo.MongoClient( 24 | settings['MONGODB_SERVER'], 25 | settings['MONGODB_PORT'] 26 | ) 27 | self.db = self.client[settings['MONGODB_DB']] 28 | self.collection = self.db[settings['MONGODB_COLLECTION']] 29 | 30 | def close_spider(self, spider): 31 | self.client.close() 32 | 33 | def process_item(self, item, spider): 34 | valid = True 35 | for data in item: 36 | if not data : 37 | valid = False 38 | raise DropItem("Missing {0}!".format(data)) 39 | if item['title'] == '': 40 | valid = False 41 | raise DropItem("title is '' ") 42 | if item['content'] == '': 43 | valid = False 44 | raise DropItem("content is '' ") 45 | if valid: 46 | self.collection.insert(dict(item)) 47 | return item 48 | 49 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/platform.py: -------------------------------------------------------------------------------- 1 | import sys 2 | def getPlatform(): 3 | platform='' 4 | if sys.platform.startswith('win'): 5 | platform = 'win' 6 | elif sys.platform.startswith('linux'): 7 | platform = 'linux' 8 | return platform -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for shuimujob project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'shuimujob' 13 | 14 | SPIDER_MODULES = ['shuimujob.spiders'] 15 | NEWSPIDER_MODULE = 'shuimujob.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'shuimujob (+http://www.yourdomain.com)' 20 | 21 | 22 | MONGODB_SERVER = "localhost" 23 | MONGODB_PORT = 27017 24 | MONGODB_DB = "shuimujob" 25 | MONGODB_COLLECTION = "job_info" 26 | 27 | # Obey robots.txt rules 28 | ROBOTSTXT_OBEY = False 29 | 30 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 31 | #CONCURRENT_REQUESTS = 32 32 | 33 | # Configure a delay for requests for the same website (default: 0) 34 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 35 | # See also autothrottle settings and docs 36 | #DOWNLOAD_DELAY = 3 37 | # The download delay setting will honor only one of: 38 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 39 | #CONCURRENT_REQUESTS_PER_IP = 16 40 | 41 | # Disable cookies (enabled by default) 42 | COOKIES_ENABLED = False 43 | 44 | # Disable Telnet Console (enabled by default) 45 | #TELNETCONSOLE_ENABLED = False 46 | 47 | # Override the default request headers: 48 | #DEFAULT_REQUEST_HEADERS = { 49 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 50 | # 'Accept-Language': 'en', 51 | #} 52 | 53 | # Enable or disable spider middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 55 | #SPIDER_MIDDLEWARES = { 56 | # 'shuimujob.middlewares.ShuimujobSpiderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable downloader middlewares 60 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 61 | #DOWNLOADER_MIDDLEWARES = { 62 | # 'shuimujob.middlewares.MyCustomDownloaderMiddleware': 543, 63 | #} 64 | 65 | # Enable or disable extensions 66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | # 'shuimujob.pipelines.ShuimujobPipeline': 300, 75 | 'shuimujob.pipelines.MongoDBPipeline':300 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 
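The MongoDBPipeline in shuimujob/pipelines.py reads the MONGODB_* values above through `from scrapy.conf import settings`, an import that has been deprecated for years and is removed in newer Scrapy releases. A minimal sketch of the same wiring done through from_crawler(), assuming only the setting names defined above (the field-validation logic of the original process_item is omitted for brevity):

# Sketch only: equivalent MongoDB wiring without scrapy.conf.
import pymongo

class MongoDBPipeline(object):
    def __init__(self, server, port, db, collection):
        self.server = server
        self.port = port
        self.db_name = db
        self.coll_name = collection

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(s.get('MONGODB_SERVER'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DB'), s.get('MONGODB_COLLECTION'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.server, self.port)
        self.collection = self.client[self.db_name][self.coll_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))  # insert() is deprecated in pymongo 3.x
        return item

Functionally this follows the original pipeline; only the way settings are obtained (crawler.settings) and the pymongo call differ.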
98 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/shuimu_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/shuimu_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/shuimu_spider.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import scrapy 3 | from shuimujob.items import ShuimujobItem 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from bs4 import BeautifulSoup 9 | from scrapy import signals 10 | from scrapy.xlib.pydispatch import dispatcher 11 | from shuimujob.platform import getPlatform 12 | 13 | class SMSpider(scrapy.spiders.CrawlSpider): 14 | ''' 15 | #要建立一个 Spider,你可以为 scrapy.spider.BaseSpider 创建一个子类,并确定三个主要的、强制的属性: 16 | #name :爬虫的识别名,它必须是唯一的,在不同的爬虫中你必须定义不同的名字. 
17 | #start_urls :爬虫开始爬的一个 URL 列表。爬虫从这里开始抓取数据,所以,第一次下载的数据将会从这些 URLS 开始。其他子 URL 将会从这些起始 URL 中继承性生成。 18 | #parse() :爬虫的方法,调用时候传入从每一个 URL 传回的 Response 对象作为参数,response 将会是 parse 方法的唯一的一个参数, 19 | #这个方法负责解析返回的数据、匹配抓取的数据(解析为 item )并跟踪更多的 URL。 20 | ''' 21 | name="shuimujob" 22 | base_url = 'http://www.newsmth.net/nForum/board/Intern' 23 | start_urls = [base_url] 24 | start_urls.extend([base_url+'?p='+str(i) for i in range(2,4)]) 25 | # start_urls = ['http://www.newsmth.net/'] 26 | platform = getPlatform() 27 | 28 | def __init__(self): 29 | scrapy.spiders.Spider.__init__(self) 30 | if self.platform == 'linux': 31 | self.driver = webdriver.PhantomJS() 32 | elif self.platform == 'win': 33 | self.driver = webdriver.PhantomJS() 34 | self.driver.set_page_load_timeout(15) 35 | dispatcher.connect(self.spider_closed, signals.spider_closed) 36 | 37 | 38 | 39 | def spider_closed(self, spider): 40 | self.driver.quit() 41 | 42 | def parse(self,response): 43 | self.driver.get(response.url) 44 | 45 | element = WebDriverWait(self.driver,30).until(EC.presence_of_all_elements_located((By.TAG_NAME,'table'))) 46 | page_source = self.driver.page_source 47 | bs_obj = BeautifulSoup(page_source, "lxml") 48 | table = bs_obj.find('table',class_='board-list tiz') 49 | intern_messages = table.find_all('tr',class_=False) 50 | for message in intern_messages: 51 | title, href, time, author = '','','','' 52 | td_9 = message.find('td',class_='title_9') 53 | if td_9: 54 | title = td_9.a.get_text().encode('utf-8','ignore') 55 | href = td_9.a['href'] 56 | td_10 = message.find('td', class_='title_10') 57 | if td_10: 58 | time=td_10.get_text().encode('utf-8','ignore') 59 | td_12 = message.find('td', class_='title_12') 60 | if td_12: 61 | author = td_12.a.get_text().encode('utf-8','ignore') 62 | item = ShuimujobItem() 63 | item['title'] = title 64 | item['href'] = href 65 | item['time'] = time 66 | item['author'] = author 67 | item['base_url_index'] = 0 68 | root_url = 'http://www.newsmth.net' 69 | # content = scrapy.Request(root_url+href,self.parse_content) 70 | if href!='': 71 | content = self.parse_content(root_url+href) 72 | # print 'content:', content 73 | item['content'] = content 74 | yield item 75 | 76 | def parse_content(self,url): 77 | 78 | self.driver.get(url) 79 | element = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located((By.TAG_NAME, 'table'))) 80 | page_source = self.driver.page_source 81 | bs_obj = BeautifulSoup(page_source, "lxml") 82 | return bs_obj.find('td', class_='a-content').p.get_text().encode('utf-8','ignore') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/house.csv: -------------------------------------------------------------------------------- 1 | house,house_area,house_room,total_price,unit_price 2 | 滨江奥城听雨苑 ,115.83,3室2厅,515.0,44462 3 | 虹苑新寓三村 ,63.8,2室2厅,196.0,30722 4 | 天坛新寓 ,75.16,3室1厅,243.0,32332 5 | 棉鞋营小区 ,69.74,3室1厅,220.0,31546 6 | 常府街10至16号 ,62.21,2室1厅,212.0,34079 7 | house,house_area,house_room,total_price,unit_price 8 | 滨江奥城听雨苑 ,115.83,3室2厅,515.0,44462 9 | 虹苑新寓三村 ,63.8,2室2厅,196.0,30722 10 | 天坛新寓 ,75.16,3室1厅,243.0,32332 11 | 棉鞋营小区 ,69.74,3室1厅,220.0,31546 12 | 常府街10至16号 ,62.21,2室1厅,212.0,34079 13 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class NjHouseItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | house=scrapy.Field() 15 | total_price=scrapy.Field() 16 | unit_price=scrapy.Field() 17 | house_room=scrapy.Field() 18 | house_area=scrapy.Field() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class NjHouseSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class NjHousePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for nj_house project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'nj_house' 13 | 14 | SPIDER_MODULES = ['nj_house.spiders'] 15 | NEWSPIDER_MODULE = 'nj_house.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'nj_house (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'nj_house.middlewares.NjHouseSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'nj_house.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'nj_house.pipelines.NjHousePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__init__.py: 
--------------------------------------------------------------------------------
 1 | # This package will contain the spiders of your Scrapy project
 2 | #
 3 | # Please refer to the documentation for information on how to create and manage
 4 | # your spiders.
 5 |
--------------------------------------------------------------------------------
/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/lj_house.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/lj_house.cpython-36.pyc
--------------------------------------------------------------------------------
/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/lj_house.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import re
 3 | import scrapy
 4 | from nj_house.items import NjHouseItem
 5 |
 6 | class LjHouseSpider(scrapy.Spider):
 7 |     name = "lj_house"
 8 |     allowed_domains = ["nj.lianjia.com"]  # domain only; a path here confuses the offsite filter
 9 |     start_urls = ['http://nj.lianjia.com/ershoufang/']
10 |
11 |     def parse(self, response):
12 |         clears = response.css('.sellListContent li')
13 |         for c in clears:
14 |             item = NjHouseItem()  # build a fresh item for every listing
15 |             house = c.css('.houseInfo a::text').extract_first()
16 |             house_text = c.css('.houseInfo::text').extract_first()
17 |             house_info_list = [e for e in re.split(r'\|', house_text) if len(e) > 1]
18 |             house_room = house_info_list[0].strip()
19 |             house_area = ''.join(re.findall(r'[\d.]', house_info_list[1]))
20 |             total_price = c.css('.totalPrice span::text').extract_first()
21 |             unit_price = c.css('.unitPrice span::text').extract_first()
22 |             unit_price = re.findall(r'\d+', unit_price)[0]
23 |
24 |             item['house'] = house
25 |             item['total_price'] = float(total_price)
26 |             item['unit_price'] = int(unit_price)
27 |             item['house_area'] = float(house_area)
28 |             item['house_room'] = house_room
29 |             yield item
30 |
31 |         page_info = response.css('div[class="page-box fr"]').css('div::attr(page-data)').extract_first()
32 |         if page_info:
33 |             page_list = re.findall(r'\d+', page_info)  # [totalPage, curPage]
34 |             cur_page, total_page = int(page_list[1]), int(page_list[0])
35 |             if cur_page < total_page:
36 |                 url = self.start_urls[0] + 'pg' + str(cur_page + 1)
37 |                 yield scrapy.Request(url=url, callback=self.parse)
38 |
--------------------------------------------------------------------------------
/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 |
 6 | [settings]
 7 | default = nj_house.settings
 8 |
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = nj_house
12 |
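The pager handled at the end of lj_house.py is driven by the page-data attribute, which Lianjia renders as a small JSON string (typically of the form {"totalPage":100,"curPage":1}). If that format holds, the regex in parse() could equally be replaced by json parsing; a hedged sketch of the tail of parse(), assuming that attribute layout and an `import json` at the top of the file:

        # Inside LjHouseSpider.parse(); assumes page-data is JSON shaped like
        # {"totalPage": 100, "curPage": 1}.
        page_info = response.css('div[class="page-box fr"]').css('div::attr(page-data)').extract_first()
        if page_info:
            page_data = json.loads(page_info)
            if page_data['curPage'] < page_data['totalPage']:
                next_url = self.start_urls[0] + 'pg' + str(page_data['curPage'] + 1)
                yield scrapy.Request(next_url, callback=self.parse)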
-------------------------------------------------------------------------------- /Python3网络爬虫快速入门篇/biqukan.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | from bs4 import BeautifulSoup 3 | import requests, sys 4 | 5 | """ 6 | 类说明:下载《笔趣看》网小说《一念永恒》 7 | Parameters: 8 | 无 9 | Returns: 10 | 无 11 | """ 12 | class downloader(object): 13 | 14 | def __init__(self): 15 | self.server = 'http://www.biqukan.com/' 16 | self.target = 'http://www.biqukan.com/1_1094/' 17 | self.names = [] #存放章节名 18 | self.urls = [] #存放章节链接 19 | self.nums = 0 #章节数 20 | 21 | """ 22 | 函数说明:获取下载链接 23 | Parameters: 24 | 无 25 | Returns: 26 | 无 27 | """ 28 | def get_download_url(self): 29 | req = requests.get(url = self.target) 30 | html = req.text 31 | div_bf = BeautifulSoup(html) 32 | div = div_bf.find_all('div', class_ = 'listmain') 33 | a_bf = BeautifulSoup(str(div[0])) 34 | a = a_bf.find_all('a') 35 | self.nums = len(a[15:]) #剔除不必要的章节,并统计章节数 36 | for each in a[15:]: 37 | self.names.append(each.string) 38 | self.urls.append(self.server + each.get('href')) 39 | 40 | """ 41 | 函数说明:获取章节内容 42 | Parameters: 43 | target - 下载连接(string) 44 | Returns: 45 | texts - 章节内容(string) 46 | """ 47 | def get_contents(self, target): 48 | req = requests.get(url = target) 49 | html = req.text 50 | bf = BeautifulSoup(html, "lxml") 51 | texts = bf.find_all('div', class_ = 'showtxt') 52 | texts = texts[0].text.replace('\xa0'*8,'\n\n') 53 | return texts 54 | 55 | """ 56 | 函数说明:将爬取的文章内容写入文件 57 | Parameters: 58 | name - 章节名称(string) 59 | path - 当前路径下,小说保存名称(string) 60 | text - 章节内容(string) 61 | Returns: 62 | 无 63 | """ 64 | def writer(self, name, path, text): 65 | write_flag = True 66 | with open(path, 'a', encoding='utf-8') as f: 67 | f.write(name + '\n') 68 | f.writelines(text) 69 | f.write('\n\n') 70 | 71 | if __name__ == "__main__": 72 | dl = downloader() 73 | dl.get_download_url() 74 | print('《一年永恒》开始下载:') 75 | for i in range(dl.nums): 76 | dl.writer(dl.names[i], '一念永恒.txt', dl.get_contents(dl.urls[i])) 77 | sys.stdout.write(" 已下载:%.3f%%" % float(i/dl.nums) + '\r') 78 | sys.stdout.flush() 79 | print('《一年永恒》下载完成') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Python3网络爬虫中小型项目实战清单 2 | 3 | 01.python爬取电影天堂 4 | 5 | 02.python爬取斗罗大陆小说 6 | 7 | 03.Python抓取欧洲足球联赛数据 8 | 9 | 04.python爬取豆瓣电影Top250 10 | 11 | 05.python爬取股票数据 12 | 13 | 06.python爬取人人贷网数据 14 | 15 | 07.python爬取创业邦创投库 16 | 17 | 08.python抓取美团网百万商家信息 *** 18 | 19 | 09.python爬取网易云音乐评论并把他们存入mysql数据库 *** 20 | 21 | 10.python爬取“网上购物”类APP 22 | 23 | 11.python爬取链家网房价信息 *** 24 | 25 | 12.python爬取并分析豆瓣中最新电影的影评(词云显示) 26 | 27 | 13.python爬取豆瓣书籍信息 28 | 29 | 14.python爬取今日头条信息并导入mongodb数据库 30 | 31 | 15.python爬取百度招聘内容并存入mongodb数据库 *** 32 | 33 | 16.python爬取熊猫直播用户信息 34 | 35 | 17.scrapy爬取游天下南京短租房信息并存入mongodb数据库 36 | 37 | 18.scrapy爬取中国医学人才网信息并以json格式保存 38 | 39 | 19.scrapy框架爬取豆瓣电影top250信息 40 | 41 | 20.scrapy爬取织梦者网站信息并存入mongodb数据库 *** 42 | 43 | 21.python爬取豆瓣电影<前任3>评论(词云显示) 44 | 45 | 22.python爬取Bilibili用户信息并导入mysql数据库 *** 46 | 47 | 23.python爬取网易云音乐所有歌曲的评论数 48 | 49 | 24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库 *** 50 | 51 | 25.scrapy爬取前程无忧网站python相关的工作信息 52 | 53 | 26.scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库 *** 54 | 55 | 27.scrapy爬取南京20000多套二手房信息 --------------------------------------------------------------------------------