├── Python3网络爬虫中小型项目实战集中营 ├── 01_python爬取电影天堂 │ ├── dytt.py │ └── 电影天堂.csv ├── 02_python爬取斗罗大陆小说 │ ├── dldl.py │ ├── 斗破苍穹小说.csv │ ├── 斗破苍穹小说.py │ └── 斗罗大陆小说.csv ├── 03_python爬取欧洲足球联赛数据 │ └── footballData.py ├── 04_python爬取豆瓣电影Top250 │ ├── douban_top250_movies.csv │ └── filmTop250.py ├── 05_python爬取股票数据 │ └── stockInfo.py ├── 06_python爬取人人贷网数据 │ └── peopleLoad.py ├── 07_python爬取创业邦创投库 │ ├── python爬取创业邦创投库.py │ └── resultsDatas.csv ├── 08_python抓取美团网百万商家信息 │ ├── meituan.csv │ └── python抓取美团网百万商家信息.py ├── 09_python爬取网易云音乐评论并把他们存入mysql数据库 │ └── python爬取网易云音乐评论并把他们存入mysql数据库.py ├── 10_python爬取“网上购物”类APP │ ├── apps.csv │ ├── python爬取网上购物类APP数据py │ └── 网上购物类APP数据分析并展示.py ├── 11_python爬取链家网房价信息 │ ├── Lianjia_Info_v1.py │ ├── Lianjia_Info_v2.py │ ├── Lianjia_Info_v3.py │ ├── Lianjia_Info_v4.py │ ├── Lianjia_Info_v4_analysis.py │ ├── lianjia.csv │ ├── lianjia_ershou_futian_100.xlsx │ └── lianjia_re_v4.csv ├── 12_python爬取并分析豆瓣中最新电影的影评(词云显示) │ ├── alice_mask.png │ ├── alice_mask1.png │ ├── python爬取并分析豆瓣中最新电影的影评.py │ ├── show_Chinese.png │ ├── stopwords.txt │ └── 豆瓣影评爬取入库.py ├── 13_python爬取豆瓣书籍信息 │ ├── books.csv │ └── python爬取豆瓣书籍信息.py ├── 14_python爬取今日头条信息并导入mongodb数据库 │ └── python爬取今日头条信息并导入mongodb数据库.py ├── 15_python使用selenium爬取百度招聘内容并存入mongodb数据库 │ └── python使用selenium爬取百度招聘内容并入mongodb数据库.py ├── 16_python爬取熊猫直播用户信息 │ └── python爬取熊猫直播用户信息.py ├── 17_scrapy爬取游天下南京短租房信息并存入mongodb数据库 │ └── youtxNanJin │ │ ├── README.txt │ │ ├── scrapy.cfg │ │ ├── youtxNanJin │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── youtxNanJin_spider.cpython-36.pyc │ │ │ └── youtxNanJin_spider.py │ │ ├── 游天下南京.csv │ │ └── 游天下南京.json ├── 18_scrapy爬取中国医学人才网信息并以json格式保存 │ └── chinadoctornet │ │ ├── README.txt │ │ ├── chinadoctornet │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── chinadoctornet_spider.cpython-36.pyc │ │ │ └── chinadoctornet_spider.py │ │ ├── scrapy.cfg │ │ ├── 中国医学人才网招聘最新招聘专栏.csv │ │ └── 中国医学人才网招聘最新招聘专栏.json ├── 19_scrapy框架爬取豆瓣电影top250信息 │ └── doubanmovie │ │ ├── README.txt │ │ ├── doubanmovie │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── doubanmovie_spider.cpython-36.pyc │ │ │ └── doubanmovie_spider.py │ │ ├── items.csv │ │ ├── items.json │ │ └── scrapy.cfg ├── 20_scrapy爬取织梦者网站信息并存入mongodb数据库 │ └── makedream │ │ ├── makedream │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── makedream_spider.cpython-36.pyc │ │ │ └── 
makedream_spider.py │ │ └── scrapy.cfg ├── 21_python爬取豆瓣电影前任3评论(词云显示) │ ├── ComentsAnaylst.py │ ├── ciyun.jpg │ ├── ciyun.png │ ├── douban.txt │ └── douban_qianren3.py ├── 22_python爬取Bilibili用户信息并导入mysql数据库 │ ├── bilibili_user.py │ ├── bilibili_user_info.sql │ └── user_agents.txt ├── 23_python爬取网易云音乐所有歌曲的评论数 │ ├── README.md │ ├── album_by_artist.py │ ├── artists.py │ ├── comments_by_music.py │ ├── music_by_album.py │ └── sql.py ├── 24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库 │ └── findtrip │ │ ├── ctrip_items.csv │ │ ├── findtrip │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── ctrip_spider.cpython-36.pyc │ │ │ ├── qua_spider.cpython-36.pyc │ │ │ └── washctrip.cpython-36.pyc │ │ │ ├── ctrip_spider.py │ │ │ ├── qua_spider.py │ │ │ └── washctrip.py │ │ ├── qua_items.csv │ │ ├── qua_items.json │ │ └── scrapy.cfg ├── 25_scrapy爬取前程无忧网站python相关的工作信息 │ └── pythonjobs │ │ ├── PythonJobs.csv │ │ ├── pythonjobs │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── job_spider.cpython-36.pyc │ │ │ └── job_spider.py │ │ └── scrapy.cfg ├── 26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库 │ └── shuimujob │ │ ├── ghostdriver.log │ │ ├── scrapy.cfg │ │ └── shuimujob │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ ├── platform.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── platform.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── shuimu_spider.cpython-36.pyc │ │ └── shuimu_spider.py └── 27_scrapy爬取南京20000多套二手房信息 │ └── nj_house │ ├── house.csv │ ├── nj_house │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── lj_house.cpython-36.pyc │ │ └── lj_house.py │ └── scrapy.cfg ├── Python3网络爬虫快速入门篇 ├── README.md ├── biqukan.py └── 一念永恒.txt └── README.md /Python3网络爬虫中小型项目实战集中营/01_python爬取电影天堂/dytt.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取电影天堂最新电影迅雷下载地址链接信息 3 | 所用模块:requests bs4 pandas数据分析 4 | ''' 5 | import requests 6 | import re 7 | import pandas as pd 8 | 9 | url = 'https://www.dy2018.com/html/gndy/dyzz/index.html' 10 | 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 13 | } 14 | 15 | items_list = [] 16 | 17 | html = requests.get(url,headers=headers) 18 | html.encoding = 'gb2312' 19 | data = re.findall('.*?',html_1.text) 27 | #print(data_1[0]) 28 | list_1 = [i[1], url_1, data_1[0]] 29 | 30 | # list_1 = [url_1] 31 | 32 | items_list.append(list_1) 33 | #print (list_1) 34 | 35 | #print 
('==========================================================================================================') 36 | 37 | for m in range(2, 298): 38 | url_2 = 'https://www.dy2018.com/html/gndy/dyzz/index_'+str(m)+'.html' 39 | print(url_2) 40 | html_2 = requests.get(url_2,headers=headers) 41 | html_2.encoding = 'gb2312' 42 | data_2 = re.findall('.*?',html_3.text) 50 | #print(data_3[0]) 51 | if len(data_3) < 1: 52 | continue 53 | list_2 = [n[1], url_3, data_3[0]] 54 | # list_2 = [url_3] 55 | 56 | 57 | items_list.append(list_2) 58 | #print (list_2) 59 | #print ('=====================================================================================================') 60 | 61 | df = pd.DataFrame(items_list, columns = ['电影名称','电影网址链接','电影迅雷下载链接']) 62 | 63 | df.to_csv('dytt.csv') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/01_python爬取电影天堂/电影天堂.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/01_python爬取电影天堂/电影天堂.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/02_python爬取斗罗大陆小说/dldl.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取斗罗大陆最新章节标题信息 3 | 所用模块:requests re bs4 pandas数据分析 4 | ''' 5 | import requests 6 | import re 7 | import pandas as pd 8 | from bs4 import BeautifulSoup #分析网页 获取标签内容 9 | 10 | url = 'https://www.freexs.org/novel/0/896/index.html' 11 | 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 14 | } 15 | 16 | items_list = [] 17 | 18 | html = requests.get(url,headers=headers) 19 | html.encoding = 'gb2312' 20 | 21 | data = re.findall('
(.*?)
' 68 | regx = re.compile(reg) 69 | ads = re.findall(regx, str(addresss)) 70 | # print(ads) 71 | # for adds in ads: 72 | # data = adds.split('|') 73 | # print(data) 74 | for itm_url, job_detail, ver_compny, ver_salary, ver_addres in zip(item_url, jobs, compy, salarys, ads): 75 | data = { 76 | 'itme_url': 'http://zhaopin.baidu.com'+itm_url.get('href'), 77 | 'job_detail': job_detail.string, 78 | 'ver_compny': str(ver_compny.string), 79 | 'ver_salary': ver_salary.string, 80 | 'ver_addres': str(ver_addres).split('|'), 81 | } 82 | print(data) 83 | # 插入数据库 84 | ver_job.insert_one(data) # 插入数据库失败 85 | f.write(str(data)) 86 | 87 | 88 | def get_page_source(page_num): 89 | time.sleep(2) 90 | driver.find_element_by_xpath('//*[@id="pagination"]/p/span/a[%s]' % page_num).click() 91 | # //*[@id="pagination"]/p/span/a[1] 为在第一页的按钮 92 | # //*[@id="pagination"]/p/span/a[2] 为第二页的按钮 93 | set_winscroll(driver) 94 | we_data = driver.page_source 95 | return we_data 96 | 97 | f = open('百度招聘前30页杭州.csv', 'a',encoding='utf-8') 98 | # 首页的数据 99 | def getBaiduHangZhouJob(we_data): 100 | parse_html(we_data) 101 | for i in range(1, 50): 102 | if i==1: 103 | we_data = get_page_source(1) 104 | parse_html(we_data) 105 | elif i<=5: 106 | we_data = get_page_source(str(2)) 107 | parse_html(we_data) 108 | else: 109 | we_data = get_page_source(str(3)) 110 | parse_html(we_data) 111 | f.close() 112 | 113 | 114 | if __name__ == '__main__': 115 | getBaiduHangZhouJob(we_data) 116 | # pool = Pool(processes=10) 117 | # pool.map_async(getBaiduHangZhouJob(we_data)) 118 | # pool.close() 119 | # f.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/16_python爬取熊猫直播用户信息/python爬取熊猫直播用户信息.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import json 5 | import pandas as pd 6 | 7 | url = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=%d&pagenum=120&_=%d".format(a=range(0,35),b=range(1501946526480,1501946526880)) 8 | 9 | headers = { 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0' 11 | , 12 | 'Cookie': '__guid=96554777.3243119502220345300.1500627276199.6702; smid=608e0bde-ffe2-4251-90ca-2938cabdc074; monitor_count=18' 13 | , 14 | } 15 | 16 | 17 | def getHtml(url): 18 | req = requests.get(url, headers=headers) 19 | print(req.text) 20 | return req.text 21 | 22 | 23 | def printInfos(data): 24 | jsondata = json.loads(data, "utf-8") 25 | # print(jsondata) 26 | itemsinfo = jsondata['data']['items'] 27 | items_list = [] 28 | for pinfo in itemsinfo: 29 | name = pinfo['name'] 30 | person_num = pinfo['person_num'] 31 | nickName = pinfo['userinfo']['nickName'] 32 | lelvel = pinfo['host_level_info'] 33 | lable = pinfo['label'] 34 | cname = pinfo['classification'] 35 | item_list = [name, person_num, nickName, lelvel, label, cname] 36 | items_list.append(item_list) 37 | df = pd.DataFrame(items_list, columns = ['name','person_num','nickName','host_level_info','label','classification']) 38 | df.to_csv('熊猫直播用户信息.csv') 39 | 40 | 41 | def mainStart(): 42 | for n in range(0, 3): 43 | pageindex = 1 + n 44 | pagetime = int(1501946526480 + n) 45 | url = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=%d&pagenum=120&_=%d"%(pageindex,pagetime) 46 | data = getHtml(url) 47 | printInfos(data) 48 | 49 | mainStart() -------------------------------------------------------------------------------- 
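The panda.tv script above follows a common paginate → parse JSON → pandas CSV workflow, but as written it mixes %-style placeholders with str.format when building the first url, references the undefined name label (the value was assigned to lable), and rewrites the CSV once per page. Below is a minimal sketch of the same workflow with those points tidied up; the endpoint, parameters and field names are copied from the script, and since the service has since shut down they are purely illustrative.

import requests
import pandas as pd

# Sketch only: URL, query parameters and JSON field names are taken from the
# script above; the panda.tv API is no longer online.
BASE_URL = 'https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=%d&pagenum=120&_=%d'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'}

def fetch_page(pageno, ts):
    resp = requests.get(BASE_URL % (pageno, ts), headers=HEADERS, timeout=10)
    resp.raise_for_status()
    return resp.json()['data']['items']          # requests can decode the JSON body directly

def main():
    rows = []
    for n in range(3):                           # same three pages as the original loop
        for info in fetch_page(n + 1, 1501946526480 + n):
            rows.append([
                info['name'],                    # room title
                info['person_num'],              # viewer count
                info['userinfo']['nickName'],    # anchor nickname
                info['host_level_info'],         # host level
                info['label'],                   # room label (key is 'label')
                info['classification'],          # category
            ])
    df = pd.DataFrame(rows, columns=['name', 'person_num', 'nickName',
                                     'host_level_info', 'label', 'classification'])
    df.to_csv('熊猫直播用户信息.csv', index=False)  # write once, after all pages are collected

if __name__ == '__main__':
    main()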
/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl youtx -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl youtx -o items.csv 时以csv格式保存下载数据 3 | 4 | 5 | Scrapy必须背下来的命令: 6 | 1 创建项目: scrapy startproject youtxNanJin 7 | startproject: 表示创建项目 8 | youtxNanJin: 表示创建的项目名 9 | 10 | 2 创建爬虫: scrapy genspider youtx "http://www.youtx.com" 11 | genspider: 表示生成一个爬虫(默认是scrapy.Spider类) 12 | youtx: 表示爬虫名(对应爬虫代码里的 name 参数) 13 | "http://www.youtx.com": 表示允许爬虫爬取的域范围 14 | 15 | 3 执行爬虫: scrapy crawl youtx 16 | crawl: 表示启动一个sc rapy爬虫 17 | youtx: 表示需要启动的爬虫名(对应爬虫代码里的 name 参数) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = youtxNanJin.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = youtxNanJin 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class YoutxnanjinItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 房源名称 17 | homeName = scrapy.Field() 18 | # 房源链接 19 | homeLine = scrapy.Field() 20 | # 房租单价 21 | homeSinglePrice = scrapy.Field() 22 | # 房租地址 23 | homeAddress = scrapy.Field() 24 | # 房租近期信息 25 | homeDetai = scrapy.Field() 26 | # 满七天价格 27 | homeSeven = scrapy.Field() 28 | # 满30天价格 29 | homeThirth = scrapy.Field() 30 | 31 | # 房东 32 | homePerson = scrapy.Field() 33 | # 房东头像 34 | homePersonImg = scrapy.Field() 35 | # 房东头像链接 36 | homePersonLink = scrapy.Field() 37 | 38 | # 房子大图 39 | homePicBg = scrapy.Field() 40 | # 房子大图链接 41 | homePicLink = scrapy.Field() 42 | 43 | # 品牌店铺信息 44 | # homePinPai = scrapy.Field() 45 | # 明星房东 46 | # homeStarrPerson = scrapy.Field() 47 | 48 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class YoutxnanjinSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | from scrapy.conf import settings 9 | import pymongo 10 | 11 | 12 | class YoutxnanjinPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | 17 | class YouTXMongo(object): 18 | def __init__(self): 19 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 20 | self.db = self.client[settings['MONGO_DB']] 21 | self.post = self.db[settings['MONGO_COLL']] 22 | 23 | def process_item(self, item, spider): 24 | postItem = dict(item) 25 | self.post.insert(postItem) 26 | return item 27 | 28 | # 写入json文件 29 | class JsonWritePipline(object): 30 | def __init__(self): 31 | self.file = open('游天下南京.json','w',encoding='utf-8') 32 | 33 | def process_item(self,item,spider): 34 | line = json.dumps(dict(item),ensure_ascii=False)+"\n" 35 | self.file.write(line) 36 | return item 37 | 38 | def spider_closed(self,spider): 39 | self.file.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for youtxNanJin project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'youtxNanJin' 13 | 14 | SPIDER_MODULES = ['youtxNanJin.spiders'] 15 | NEWSPIDER_MODULE = 'youtxNanJin.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'youtxNanJin (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # 配置mongoDB 27 | MONGO_HOST = "127.0.0.1" # 主机IP 28 | MONGO_PORT = 27017 # 端口号 29 | MONGO_DB = "YouTianXia" # 库名 30 | MONGO_COLL = "house_nanjin" # collection 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | #DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | #} 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'youtxNanJin.middlewares.YoutxnanjinSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'youtxNanJin.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | # 'youtxNanJin.pipelines.YoutxnanjinPipeline': 300, 77 | 'youtxNanJin.pipelines.YouTXMongo': 300, 78 | 'youtxNanJin.pipelines.JsonWritePipline': 300, 79 | } 80 | 81 | # Enable and configure the AutoThrottle extension (disabled by default) 82 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 83 | #AUTOTHROTTLE_ENABLED = True 84 | # The initial download delay 85 | #AUTOTHROTTLE_START_DELAY = 5 86 | # The maximum download delay to be set in case of high latencies 87 | #AUTOTHROTTLE_MAX_DELAY = 60 88 | # The average number of requests Scrapy should be sending in parallel to 89 | # each remote server 90 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 91 | # Enable showing throttling stats for every response received: 92 | #AUTOTHROTTLE_DEBUG = False 93 | 94 | # Enable and configure HTTP caching (disabled by default) 95 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 96 | #HTTPCACHE_ENABLED = True 97 | #HTTPCACHE_EXPIRATION_SECS = 0 98 | #HTTPCACHE_DIR = 'httpcache' 99 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 100 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 101 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/youtxNanJin_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/youtxNanJin_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/youtxNanJin_spider.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import scrapy 3 | from youtxNanJin.items import YoutxnanjinItem 4 | 5 | class NanJinDefault(scrapy.Spider): 6 | name = 'youtx' 7 | allowed_domains = ['youtx.com'] 8 | start_urls = ["http://www.youtx.com/nanjing/longrent1-page{}".format(n) for n in range(0,6)] 9 | def parse(self, response): 10 | # print(response.body) 11 | node_list = response.xpath("//div[@class='duanzu houseList']/ul/li[@class='clearfix']") 12 | # print(node_list) 13 | for node in node_list: 14 | item = YoutxnanjinItem() 15 | homeName = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/text()").extract() 16 | homeLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/@href").extract() 17 | print(homeName) 18 | print(homeLink) 19 | 20 | # 单日价格 21 | homeSinglePrice = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/span/span[@class='housePrice']/text()").extract() 22 | print(homeSinglePrice) 23 | 24 | # 获取房源地址 25 | homeAddress = node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='clearfix mt5']/text()").extract() 26 | # 房租信息 27 | homeDesc =node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='mt5']/text()").extract() 28 | homeDesc2 =node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left 
mt2']/p[@class='mt5']/span[2]/text()").extract() 29 | print(homeAddress) 30 | print(homeDesc) 31 | print(homeDesc2) 32 | 33 | # 满30天的信息 34 | homeThrty = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/div[@class='mix12_5']/div[@class='discount']/div[@class='discount-price']/span//text()").extract() 35 | print(homeThrty) 36 | # 房东信息 37 | homePerson = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/text()").extract() 38 | # 房东链接 39 | homePersonLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/@href").extract() 40 | print(homePerson) 41 | print(homePersonLink) 42 | 43 | # 房源大图图片 44 | homeBigPic = node.xpath("./div[@class='house-img']/a[1]/img/@src").extract() 45 | homeBigPicLink = node.xpath("./div[@class='house-img']/a[1]/@href").extract() 46 | print(homeBigPic) 47 | print(homeBigPicLink) 48 | # 房东头像信息 49 | personPic = node.xpath("./div[@class='house-img']/a[2]/img/@src").extract() 50 | # 房东头像链接地址 51 | personPicLink = node.xpath("./div[@class='house-img']/a[2]/img/@href").extract() 52 | 53 | print(personPic) 54 | print(homePersonLink) 55 | item['homeName'] ="".join(homeName) 56 | item['homeLine'] ="".join(homeLink) 57 | item['homeSinglePrice'] ="".join(homeSinglePrice) 58 | item['homeAddress'] ="".join(homeAddress) 59 | item['homeDetai'] ="".join(homeDesc)+"".join(homeDesc2) 60 | # 这里的值暂时没有取出来 61 | item['homeSeven'] ="".join(homeThrty) 62 | item['homeThirth'] ="".join(homeThrty) 63 | 64 | item['homePerson'] ="".join(homePerson) 65 | item['homePersonImg'] ="".join(personPic) 66 | item['homePersonLink'] ="".join(homePersonLink) 67 | item['homePicBg'] ="".join(homeBigPic) 68 | item['homePicLink'] ="".join(homeBigPicLink) 69 | yield item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl docNet -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl docNet -o items.csv 时以csv格式保存下载数据 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ChinadoctornetItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | 15 | # 爬取中国医学人才网的条目(共5个条目) 16 | # 医院名称 17 | hospitalName = scrapy.Field() 18 | # 医院规模 19 | hospitalSize = scrapy.Field() 20 | # 医院所在地 21 | hospitalAddress = scrapy.Field() 22 | # 医院科目 23 | hospitalDesc = scrapy.Field() 24 | # pass 25 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ChinadoctornetSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 
35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | # import json 8 | 9 | class ChinadoctornetPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | 13 | 14 | # class JsonWriterPipeline(object): 15 | # def __init__(self): 16 | # self.file = open('中国医学人才网招聘最新招聘专栏2.json', 'w', encoding='utf-8') 17 | 18 | # def process_item(self, item, spider): 19 | # line = json.dumps(dict(item), ensure_ascii=False) + "\n" 20 | # self.file.write(line) 21 | # return item 22 | 23 | # def spider_closed(self, spider): 24 | # self.file.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for chinadoctornet project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'chinadoctornet' 13 | 14 | SPIDER_MODULES = ['chinadoctornet.spiders'] 15 | NEWSPIDER_MODULE = 'chinadoctornet.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'chinadoctornet (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'chinadoctornet.middlewares.ChinadoctornetSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'chinadoctornet.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | # ITEM_PIPELINES = { 68 | # # 'chinadoctornet.pipelines.ChinadoctornetPipeline': 300, 69 | # 'chinadoctornet.pipelines.JsonWritePipline': 300, 70 | # } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
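In this project ITEM_PIPELINES stays commented out (and the commented entry names 'JsonWritePipline' while the class drafted in pipelines.py is JsonWriterPipeline), so output relies entirely on the -o feed export from the README. For reference, enabling the pipeline route would mirror the youtxNanJin project earlier in this repo; a minimal sketch, assuming the same JSON-lines output, could look like this:

# Sketch only: names here are assumptions chosen to match each other,
# not the exact code in this project.

# chinadoctornet/pipelines.py
import json

class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('中国医学人才网招聘最新招聘专栏.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # one JSON object per line, keeping Chinese text readable
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        # close_spider is the hook Scrapy calls on item pipelines at shutdown
        self.file.close()

# chinadoctornet/settings.py — the dotted path must match the class name above
ITEM_PIPELINES = {
    'chinadoctornet.pipelines.JsonWriterPipeline': 300,
}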
-------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/chinadoctornet_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/chinadoctornet_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/chinadoctornet_spider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import scrapy 3 | from chinadoctornet.items import ChinadoctornetItem 4 | 5 | 6 | class ChinaDocNet(scrapy.Spider): 7 | # 启动爬虫的名称 8 | name = 'docNet' 9 | # 爬取域名的范围 10 | allowed_domains = ['yixuezp.com'] 11 | # 爬虫第一个url地址 12 | start_urls = ['http://www.yixuezp.com/zhaopin?page={}'.format(n) for n in range(0, 464)] # 463 13 | 14 | def parse(self, response): 15 | # 医院name 16 | node_list = response.xpath("//div[@class='newsjob']/ul/li") 17 | items = [] 18 | for node in node_list: 19 | item = ChinadoctornetItem() 20 | hospitalName = node.xpath("./a/text()").extract() 21 | hospitalSize = node.xpath("./span[1]/text()").extract() 22 | hospitalAddress = node.xpath("./span[2]/text()").extract() 23 | hospitalDesc = node.xpath("./p/a/text()").extract() 24 | 25 | item['hospitalName'] = hospitalName 26 | item['hospitalSize'] = hospitalSize 27 | item['hospitalAddress'] = hospitalAddress 28 | item['hospitalDesc'] = hospitalDesc 29 | items.append(item) 30 | # return items # 如果直接return的话,一页数据只会返回一条数据 31 | yield item #用yield 的话,可以交给下载器,继续执行下一步操作。 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = chinadoctornet.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = chinadoctornet 12 | -------------------------------------------------------------------------------- 
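With the spider above registered as docNet, the README's commands (scrapy crawl docNet -o items.json or -o items.csv) hand saving over to Scrapy's feed exporters. A small assumed follow-up for pulling that export into pandas is shown below; the file names follow the README and the columns follow chinadoctornet/items.py.

# Sketch only: 'items.json' / 'items.csv' are the file names used in the README.
import pandas as pd

# scrapy crawl docNet -o items.json  -> a single JSON array of items
df = pd.read_json('items.json')
# scrapy crawl docNet -o items.csv   -> the same data as CSV
# df = pd.read_csv('items.csv')

# The spider collects each field with .extract(), so values arrive as
# single-element lists rather than plain strings.
print(df[['hospitalName', 'hospitalSize', 'hospitalAddress']].head())
print('total postings:', len(df))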
/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/中国医学人才网招聘最新招聘专栏.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/中国医学人才网招聘最新招聘专栏.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl doubanMovie -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl doubanMovie -o items.csv 时以csv格式保存下载数据 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanmovieItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 电影名字 17 | name = scrapy.Field() 18 | # 电影信息 19 | info = scrapy.Field() 20 | # 评分 21 | rating = scrapy.Field() 22 | # 评论人数 23 | num = scrapy.Field() 24 | # 经典语句 25 | quote = scrapy.Field() 26 | # 电影图片 27 | img_url = scrapy.Field() 28 | 
-------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DoubanmovieSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanmoviePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for doubanmovie project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'doubanmovie' 13 | 14 | SPIDER_MODULES = ['doubanmovie.spiders'] 15 | NEWSPIDER_MODULE = 'doubanmovie.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'doubanmovie (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'doubanmovie.middlewares.DoubanmovieSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'doubanmovie.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'doubanmovie.pipelines.DoubanmoviePipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
-------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/doubanmovie_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/doubanmovie_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/doubanmovie_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from doubanmovie.items import DoubanmovieItem 3 | 4 | class Movie(scrapy.Spider): 5 | # 爬虫唯一标识符 6 | name = 'doubanMovie' 7 | # 爬取域名 8 | allowed_domain = ['movie.douban.com'] 9 | # 爬取页面地址 10 | start_urls = ['https://movie.douban.com/top250'] 11 | 12 | def parse(self, response): 13 | selector = scrapy.Selector(response) 14 | # 解析出各个电影 15 | movies = selector.xpath('//div[@class="item"]') 16 | # 存放电影信息 17 | item = DoubanmovieItem() 18 | 19 | for movie in movies: 20 | 21 | # 电影各种语言名字的列表 22 | titles = movie.xpath('.//span[@class="title"]/text()').extract() 23 | # 将中文名与英文名合成一个字符串 24 | name = '' 25 | for title in titles: 26 | name += title.strip() 27 | item['name'] = name 28 | 29 | # 电影信息列表 30 | infos = movie.xpath('.//div[@class="bd"]/p/text()').extract() 31 | # 电影信息合成一个字符串 32 | fullInfo = '' 33 | for info in infos: 34 | fullInfo += info.strip() 35 | item['info'] = fullInfo 36 | # 提取评分信息 37 | item['rating'] = movie.xpath('.//span[@class="rating_num"]/text()').extract()[0].strip() 38 | # 提取评价人数 39 | item['num'] = movie.xpath('.//div[@class="star"]/span[last()]/text()').extract()[0].strip()[:-3] 40 | # 提取经典语句,quote可能为空 41 | quote = movie.xpath('.//span[@class="inq"]/text()').extract() 42 | if quote: 43 | quote = quote[0].strip() 44 | item['quote'] = quote 45 | # 提取电影图片 46 | item['img_url'] = movie.xpath('.//img/@src').extract()[0] 47 | 48 | yield item 49 | 50 | next_page = selector.xpath('//span[@class="next"]/a/@href').extract()[0] 51 | url = 'https://movie.douban.com/top250' + next_page 52 | if next_page: 53 | yield scrapy.Request(url, callback=self.parse) 54 | 55 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = doubanmovie.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = doubanmovie 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MakedreamItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 文章标题 17 | articleTitle = scrapy.Field() 18 | # 文章标题url 19 | articleUrl = scrapy.Field() 20 | # 文章描述 21 | articleDesc = 
scrapy.Field() 22 | # 文章发布时间 23 | articlePublic = scrapy.Field() 24 | # 文章类型 25 | articleType = scrapy.Field() 26 | # 文章标签 27 | articleTag = scrapy.Field() 28 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MakedreamSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
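# ---- editor's note (not part of the original file) -------------------------
# As reproduced here, the methods of MakedreamSpiderMiddleware other than
# from_crawler() and spider_opened() -- process_spider_input/output/exception
# and this process_start_requests -- are missing the `self` parameter, so they
# would not work as instance methods if the middleware were ever enabled.
# SPIDER_MIDDLEWARES stays commented out in settings.py, so the bug is latent;
# the findtrip copy of this file in project 24 has the same issue.
# -----------------------------------------------------------------------------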
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | import pymongo 9 | from scrapy.conf import settings 10 | 11 | class MakedreamPipeline(object): 12 | def process_item(self, item, spider): 13 | return item 14 | 15 | 16 | class DreamMongo(object): 17 | def __init__(self): 18 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 19 | self.db = self.client[settings['MONGO_DB']] 20 | self.post = self.db[settings['MONGO_COLL']] 21 | 22 | def process_item(self, item, spider): 23 | postItem = dict(item) 24 | self.post.insert(postItem) 25 | return item 26 | 27 | 28 | # 写入json文件类 29 | class JsonWritePipeline(object): 30 | def __init__(self): 31 | self.file = open('织梦网其他编程.json', 'w', encoding='utf-8') 32 | 33 | def process_item(self, item, spider): 34 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 35 | self.file.write(line) 36 | return item 37 | 38 | def spider_closed(self, spider): 39 | self.file.close() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for makedream project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'makedream' 13 | 14 | SPIDER_MODULES = ['makedream.spiders'] 15 | NEWSPIDER_MODULE = 'makedream.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'makedream (+http://www.yourdomain.com)' 20 | # 配置mongoDB 21 | MONGO_HOST = "127.0.0.1" # 主机IP 22 | MONGO_PORT = 27017 # 端口号 23 | MONGO_DB = "DreamDB" # 库名 24 | MONGO_COLL = "Dream_info" # collection 25 | 26 | 27 | 28 | # Obey robots.txt rules 29 | ROBOTSTXT_OBEY = False 30 | 31 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 32 | #CONCURRENT_REQUESTS = 32 33 | 34 | # Configure a delay for requests for the same website (default: 0) 35 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 36 | # See also autothrottle settings and docs 37 | #DOWNLOAD_DELAY = 3 38 | # The download delay setting will honor only one of: 39 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 40 | #CONCURRENT_REQUESTS_PER_IP = 16 41 | 42 | # Disable cookies (enabled by default) 43 | # COOKIES_ENABLED = False 44 | 45 | # Disable Telnet Console (enabled by default) 46 | #TELNETCONSOLE_ENABLED = False 47 | 48 | # Override the default request headers: 49 | #DEFAULT_REQUEST_HEADERS = { 50 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 51 | # 'Accept-Language': 'en', 52 | #} 53 | 54 | # Enable or disable spider middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 56 | #SPIDER_MIDDLEWARES = { 57 | # 'makedream.middlewares.MakedreamSpiderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable downloader middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | #DOWNLOADER_MIDDLEWARES = { 63 | # 'makedream.middlewares.MyCustomDownloaderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | ITEM_PIPELINES = { 75 | # 'makedream.pipelines.MakedreamPipeline': 300, 76 | 'makedream.pipelines.JsonWritePipeline':300, 77 | 'makedream.pipelines.DreamMongo':300 78 | } 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 
99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/makedream_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/makedream_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/makedream_spider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import scrapy 3 | from makedream.items import MakedreamItem 4 | 5 | 6 | class DramingNet(scrapy.Spider): 7 | # 启动爬虫的名称 8 | name = 'dreaming' 9 | # 爬虫的域范围 10 | allowed_domains = ['zhimengzhe.com'] 11 | # 爬虫的第一个url 12 | start_urls = ['http://www.zhimengzhe.com/bianchengjiaocheng/qitabiancheng/index_{}.html'.format(n) for n in 13 | range(0, 1466)] 14 | 15 | # 爬取结果解析 16 | def parse(self, response): 17 | base_url = 'http://www.zhimengzhe.com' 18 | # print(response.body) 19 | node_list = response.xpath("//ul[@class='list-unstyled list-article']/li") 20 | for node in node_list: 21 | item = MakedreamItem() 22 | nextNode = node.xpath("./div[@class='pull-left ltxt w658']") 23 | print('*' * 30) 24 | title = nextNode.xpath('./h3/a/text()').extract() 25 | link = nextNode.xpath('./h3/a/@href').extract() 26 | desc = nextNode.xpath('./p/text()').extract() 27 | 28 | # 创建时间,类型,标签 29 | publicTime = nextNode.xpath("./div[@class='tagtime']/span[1]/text()").extract() 30 | publicType = nextNode.xpath("./div[@class='tagtime']/span[2]/a/text()").extract() 31 | publicTag = nextNode.xpath("./div[@class='tagtime']/span[3]/a/text()").extract() 32 | # node 33 | titleLink = base_url + ''.join(link) 34 | item['articleTitle'] = title 35 | # 文章标题url 36 | item['articleUrl'] = titleLink 37 | # 文章描述 38 | item['articleDesc'] = desc 39 | # 文章发布时间 40 | item['articlePublic'] = publicTime 41 | # 文章类型 42 | item['articleType'] = publicType 43 | # 文章标签 44 | item['articleTag'] = publicTag 45 | yield item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = makedream.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = makedream 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ComentsAnaylst.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2018/4/25 11:15 3 | # @File : commentsAnaylst.py(再见前任3的影评f词云) 4 | 5 | import matplotlib.pyplot as plt 6 | from PIL import Image 7 | from wordcloud import WordCloud 8 | import jieba 9 | import numpy as np 10 | #读取txt格式的文本内容 11 | text_from_file_with_apath = open('douban.txt','rb').read() 12 | 13 | #使用jieba进行分词,并对分词的结果以空格隔开 14 | wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all = True) 15 | wl_space_split = " ".join(wordlist_after_jieba) 16 | 17 | #对分词后的文本生成词云 18 | # my_wordcloud = WordCloud().generate(wl_space_split) 19 | 20 | font = r'C:\Windows\Fonts\simfang.ttf' 21 | mask = np.array(Image.open('ciyun.jpg')) 22 | wc = WordCloud(mask=mask,max_words=3000,collocations=False, font_path=font, width=5800, height=2400, margin=10,background_color='black').generate(wl_space_split) 23 | default_colors = wc.to_array() 24 | plt.title("QR 3") 25 | plt.imshow(wc) 26 | plt.axis("off") 27 | plt.savefig("ciyun.png") 28 | plt.show() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.jpg -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.png -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/21_python爬取豆瓣电影前任3评论(词云显示)/douban_qianren3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2018/4/25 11:15 3 | # @File : test_douban_qianren3.py(再见前任3的影评) 4 | 5 | import csv 6 | import requests 7 | from lxml import etree 8 | import time 9 | 10 | 11 | url = 'https://movie.douban.com/subject/26662193/comments?start=0&limit=20&sort=new_score&status=P&percent_type=' 12 | 13 | headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36', 15 | 'Cookie': 'gr_user_id=ffdf2f63-ec37-49b5-99e8-0e0d28741172; bid=qh9RXgIGopg; viewed="26826540_24703171"; ap=1; ll="118172"; ct=y; _vwo_uuid_v2=8C5B24903B1D1D3886FE478B91C5DE97|7eac18658e7fecbbf3798b88cfcf6113; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1522129522%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DdnHqCRiT1HlhToCp0h1cpdyV8rB9f_OfOvJhjRPO3p1jrl764LGvi7gbYSdskDMh%26wd%3D%26eqid%3De15db1bb0000e3cd000000045ab9b6fe%22%5D; 
_pk_id.100001.4cf6=4e61f4192b9486a8.1485672092.10.1522130672.1522120744.; _pk_ses.100001.4cf6=*'} 16 | 17 | 18 | def get_html(current_url): 19 | time.sleep(2) 20 | r = requests.get(current_url, headers=headers) 21 | r.raise_for_status() 22 | return etree.HTML(r.text) 23 | 24 | 25 | def parse_html(content,writer): 26 | links = content.xpath("//*[@class='comment-item']") 27 | for link in links: 28 | content = link.xpath("./div[@class='comment']/p/text()")[0].strip() 29 | author = link.xpath("./div[@class='comment']/h3/span[@class='comment-info']/a/text()")[0].strip() 30 | time = link.xpath("./div[@class='comment']/h3/span[@class='comment-info']/span[@class='comment-time ']/text()")[ 31 | 0].strip() 32 | is_useful = link.xpath("./div[@class='comment']/h3/span[@class='comment-vote']/span[@class='votes']/text()")[0] 33 | print('content:', content) 34 | print('time:', time) 35 | print('is_useful:', is_useful) 36 | # detail = (author, time, is_useful, content) 37 | detail = (is_useful,content) 38 | writer.writerow(detail) 39 | 40 | 41 | if __name__ == '__main__': 42 | with open('douban.txt', 'a+', encoding='utf-8', newline='') as csvf: 43 | writer = csv.writer(csvf) 44 | writer.writerow(('作者', '时间', '有用数', '内容')) 45 | for page in range(0, 260, 20): 46 | url = 'https://movie.douban.com/subject/26662193/comments?start={}&limit=20&sort=new_score&status=P&percent_type='.format( 47 | page) 48 | r = get_html(url) 49 | parse_html(r,writer) -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/22_python爬取Bilibili用户信息并导入mysql数据库/bilibili_user.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf8-*- 2 | 3 | import requests 4 | import json 5 | import random 6 | import pymysql 7 | import sys 8 | import datetime 9 | import time 10 | from imp import reload 11 | from multiprocessing.dummy import Pool as ThreadPool 12 | 13 | 14 | def datetime_to_timestamp_in_milliseconds(d): 15 | def current_milli_time(): return int(round(time.time() * 1000)) 16 | 17 | return current_milli_time() 18 | 19 | 20 | reload(sys) 21 | 22 | 23 | def LoadUserAgents(uafile): 24 | """ 25 | uafile : string 26 | path to text file of user agents, one per line 27 | """ 28 | uas = [] 29 | with open(uafile, 'rb') as uaf: 30 | for ua in uaf.readlines(): 31 | if ua: 32 | uas.append(ua.strip()[1:-1 - 1]) 33 | random.shuffle(uas) 34 | return uas 35 | 36 | 37 | uas = LoadUserAgents("user_agents.txt") 38 | head = { 39 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', 40 | 'X-Requested-With': 'XMLHttpRequest', 41 | 'Referer': 'http://space.bilibili.com/45388', 42 | 'Origin': 'http://space.bilibili.com', 43 | 'Host': 'space.bilibili.com', 44 | 'AlexaToolbar-ALX_NS_PH': 'AlexaToolbar/alx-4.0', 45 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4', 46 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 47 | } 48 | proxies = { 49 | 'http': 'http://61.155.164.108:3128', 50 | 'http': 'http://116.199.115.79:80', 51 | 'http': 'http://42.245.252.35:80', 52 | 'http': 'http://106.14.51.145:8118', 53 | 'http': 'http://116.199.115.78:80', 54 | 'http': 'http://123.147.165.143:8080', 55 | 'http': 'http://58.62.86.216:9999', 56 | 'http': 'http://202.201.3.121:3128', 57 | 'http': 'http://119.29.201.134:808', 58 | 'http': 'http://61.155.164.112:3128', 59 | 'http': 'http://123.57.76.102:80', 60 | 'http': 'http://116.199.115.78:80', 61 | } 62 | time1 = time.time() 63 
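# ---- editor's note (sketch, not part of the original script) ----------------
# The `proxies` dict above repeats the key 'http', so the dict literal keeps
# only the last entry and every request ends up going through a single proxy.
# If rotation is the intent, a plain list plus random.choice is a common
# pattern (addresses copied from the sample list above; they are likely stale):
proxy_pool = [
    'http://61.155.164.108:3128',
    'http://116.199.115.79:80',
    'http://42.245.252.35:80',
]
def pick_proxy():
    # requests expects a mapping such as {'http': 'http://host:port'};
    # `random` is already imported at the top of this script
    return {'http': random.choice(proxy_pool)}
# ------------------------------------------------------------------------------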
| 64 | for m in range(99, 101): # 26 ,1000 65 | urls = [] 66 | for i in range(m * 100, (m + 1) * 100): 67 | url = 'https://space.bilibili.com/' + str(i) 68 | urls.append(url) 69 | 70 | 71 | def getsource(url): 72 | payload = { 73 | '_': datetime_to_timestamp_in_milliseconds(datetime.datetime.now()), 74 | 'mid': url.replace('https://space.bilibili.com/', '') 75 | } 76 | ua = random.choice(uas) 77 | head = { 78 | 'User-Agent': ua, 79 | 'Referer': 'https://space.bilibili.com/' + str(i) + '?from=search&seid=' + str(random.randint(10000, 50000)) 80 | } 81 | jscontent = requests \ 82 | .session() \ 83 | .post('http://space.bilibili.com/ajax/member/GetInfo', 84 | headers=head, 85 | data=payload, 86 | proxies=proxies) \ 87 | .text 88 | time2 = time.time() 89 | try: 90 | jsDict = json.loads(jscontent) 91 | statusJson = jsDict['status'] if 'status' in jsDict.keys() else False 92 | if statusJson == True: 93 | if 'data' in jsDict.keys(): 94 | jsData = jsDict['data'] 95 | mid = jsData['mid'] 96 | name = jsData['name'] 97 | sex = jsData['sex'] 98 | face = jsData['face'] 99 | coins = jsData['coins'] 100 | spacesta = jsData['spacesta'] 101 | birthday = jsData['birthday'] if 'birthday' in jsData.keys() else 'nobirthday' 102 | place = jsData['place'] if 'place' in jsData.keys() else 'noplace' 103 | description = jsData['description'] 104 | article = jsData['article'] 105 | playnum = jsData['playNum'] 106 | sign = jsData['sign'] 107 | level = jsData['level_info']['current_level'] 108 | exp = jsData['level_info']['current_exp'] 109 | print("Succeed: " + mid + "\t" + str(time2 - time1)) 110 | try: 111 | res = requests.get( 112 | 'https://api.bilibili.com/x/space/navnum?mid=' + str(mid) + '&jsonp=jsonp').text 113 | js_fans_data = json.loads(res) 114 | following = js_fans_data['data']['following'] 115 | fans = js_fans_data['data']['follower'] 116 | except: 117 | following = 0 118 | fans = 0 119 | else: 120 | print('no data now') 121 | try: 122 | conn = pymysql.connect( 123 | host='127.0.0.1', port=3306, user='root', passwd='******', db='sunshine',charset="utf8") 124 | cur = conn.cursor() 125 | cur.execute('INSERT INTO bilibili_user_info(mid, name, sex, face, coins, spacesta, \ 126 | birthday, place, description, article, following, fans, playnum, sign, level, exp) \ 127 | VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")' 128 | % ( 129 | mid, name, sex, face, coins, spacesta, 130 | birthday, place, description, article, 131 | following, fans, playnum, sign, level, exp 132 | )) 133 | conn.commit() 134 | except Exception: 135 | print("MySQL Error") 136 | else: 137 | print("Error: " + url) 138 | except ValueError: 139 | pass 140 | 141 | 142 | pool = ThreadPool(1) 143 | try: 144 | results = pool.map(getsource, urls) 145 | except Exception: 146 | print('ConnectionError') 147 | pool.close() 148 | pool.join() 149 | time.sleep(11) 150 | pool = ThreadPool(1) 151 | results = pool.map(getsource, urls) 152 | 153 | time.sleep(30) 154 | 155 | pool.close() 156 | pool.join() 157 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/22_python爬取Bilibili用户信息并导入mysql数据库/bilibili_user_info.sql: -------------------------------------------------------------------------------- 1 | # ************************************************************ 2 | # Sequel Pro SQL dump 3 | # Version 4135 4 | # 5 | # http://www.sequelpro.com/ 6 | # http://code.google.com/p/sequel-pro/ 7 | # 8 | # Host: 127.0.0.1 (MySQL 5.1.63) 9 | # Database: sunshine 10 | # 
Generation Time: 2018-04-26 13:33:32 +0000 11 | # ************************************************************ 12 | 13 | 14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 17 | /*!40101 SET NAMES utf8 */; 18 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 19 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 20 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 21 | 22 | 23 | # Dump of table bilibili_user_info 24 | # ------------------------------------------------------------ 25 | 26 | CREATE TABLE `bilibili_user_info` ( 27 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 28 | `mid` varchar(11) DEFAULT NULL, 29 | `name` varchar(45) DEFAULT NULL, 30 | `sex` varchar(11) DEFAULT NULL, 31 | `face` varchar(200) DEFAULT NULL, 32 | `coins` int(11) DEFAULT NULL, 33 | `spacesta` int(11) DEFAULT NULL, 34 | `birthday` varchar(45) DEFAULT NULL, 35 | `place` varchar(45) DEFAULT NULL, 36 | `description` varchar(45) DEFAULT NULL, 37 | `article` int(11) DEFAULT NULL, 38 | `following` int(11) DEFAULT NULL, 39 | `fans` int(11) DEFAULT NULL, 40 | `playnum` int(30) DEFAULT NULL, 41 | `sign` varchar(300) DEFAULT NULL, 42 | `level` int(11) DEFAULT NULL, 43 | `exp` int(11) DEFAULT NULL, 44 | PRIMARY KEY (`id`) 45 | ) ENGINE=MyISAM DEFAULT CHARSET=utf8; 46 | 47 | 48 | 49 | 50 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 51 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 52 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 53 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 54 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 55 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 56 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/README.md: -------------------------------------------------------------------------------- 1 | #### 这是一个爬取网易云音乐的所有的歌曲的评论数的爬虫。 2 | 3 | 以下为主要思路: 4 | 5 | - 1. 爬取所有的歌手信息([artists.py]); 6 | - 2. 根据上一步爬取到的歌手信息去爬取所有的专辑信息([album_by_artist.py]); 7 | - 3. 根据专辑信息爬取所有的歌曲信息([music_by_album.py]); 8 | - 4. 根据歌曲信息爬取其评论条数([comments_by_music.py]) 9 | - 5. 
数据库相关的语句都存放于([sql.py])中。 -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/album_by_artist.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据上一步获取的歌手的 ID 来用于获取所有的专辑 ID 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import time 7 | from music_163 import sql 8 | 9 | 10 | class Album(object): 11 | headers = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 15 | 'Cache-Control': 'no-cache', 16 | 'Connection': 'keep-alive', 17 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; _ga=GA1.2.1405085820.1476521280; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; JSESSIONID-WYYY=189f31767098c3bd9d03d9b968c065daf43cbd4c1596732e4dcb471beafe2bf0605b85e969f92600064a977e0b64a24f0af7894ca898b696bd58ad5f39c8fce821ec2f81f826ea967215de4d10469e9bd672e75d25f116a9d309d360582a79620b250625859bc039161c78ab125a1e9bf5d291f6d4e4da30574ccd6bbab70b710e3f358f%3A1476594130342; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476588849.1476592408.6; __utmb=94650624.11.10.1476592408; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 18 | 'DNT': '1', 19 | 'Host': 'music.163.com', 20 | 'Pragma': 'no-cache', 21 | 'Referer': 'http://music.163.com/', 22 | 'Upgrade-Insecure-Requests': '1', 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 24 | } 25 | 26 | def save_albums(self, artist_id): 27 | params = {'id': artist_id, 'limit': '200'} 28 | # 获取歌手个人主页 29 | r = requests.get('http://music.163.com/artist/album', headers=self.headers, params=params) 30 | 31 | # 网页解析 32 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 33 | body = soup.body 34 | 35 | albums = body.find_all('a', attrs={'class': 'tit f-thide s-fc0'}) # 获取所有专辑 36 | 37 | for album in albums: 38 | albume_id = album['href'].replace('/album?id=', '') 39 | sql.insert_album(albume_id, artist_id) 40 | 41 | 42 | if __name__ == '__main__': 43 | artists = sql.get_all_artist() 44 | my_album = Album() 45 | for i in artists: 46 | try: 47 | my_album.save_albums(i['ARTIST_ID']) 48 | # print(i) 49 | except Exception as e: 50 | # 打印错误日志 51 | print(str(i) + ': ' + str(e)) 52 | time.sleep(5) 53 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/artists.py: -------------------------------------------------------------------------------- 1 | """ 2 | 获取所有的歌手信息 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from music_163 import sql 7 | 8 | headers = { 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 10 | 'Accept-Encoding': 'gzip, deflate, sdch', 11 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 12 | 'Cache-Control': 'no-cache', 13 | 'Connection': 'keep-alive', 14 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; 
_ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; NTES_SESS=Fa2uk.YZsGoj59AgD6tRjTXGaJ8_1_4YvGfXUkS7C1NwtMe.tG1Vzr255TXM6yj2mKqTZzqFtoEKQrgewi9ZK60ylIqq5puaG6QIaNQ7EK5MTcRgHLOhqttDHfaI_vsBzB4bibfamzx1.fhlpqZh_FcnXUYQFw5F5KIBUmGJg7xdasvGf_EgfICWV; S_INFO=1476597594|1|0&80##|hourui93; NETEASE_AUTH_SOURCE=space; NETEASE_AUTH_USERNAME=hourui93; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=cbd082d2ce2cffbcd5c085d8bf565a95aee3173ddbbb00bfa270950f93f1d8bb4cb55a56a4049fa8c828373f630c78f4a43d6c3d252c4c44f44b098a9434a7d8fc110670a6e1e9af992c78092936b1e19351435ecff76a181993780035547fa5241a5afb96e8c665182d0d5b911663281967d675ff2658015887a94b3ee1575fa1956a5a%3A1476607977016; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476595468.1476606177.8; __utmb=94650624.20.10.1476606177; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 15 | 'DNT': '1', 16 | 'Host': 'music.163.com', 17 | 'Pragma': 'no-cache', 18 | 'Referer': 'http://music.163.com/', 19 | 'Upgrade-Insecure-Requests': '1', 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 21 | } 22 | 23 | 24 | def save_artist(group_id, initial): 25 | params = {'id': group_id, 'initial': initial} 26 | r = requests.get('http://music.163.com/discover/artist/cat', params=params) 27 | 28 | # 网页解析 29 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 30 | body = soup.body 31 | 32 | hot_artists = body.find_all('a', attrs={'class': 'msk'}) 33 | artists = body.find_all('a', attrs={'class': 'nm nm-icn f-thide s-fc0'}) 34 | 35 | for artist in hot_artists: 36 | artist_id = artist['href'].replace('/artist?id=', '').strip() 37 | artist_name = artist['title'].replace('的音乐', '') 38 | try: 39 | sql.insert_artist(artist_id, artist_name) 40 | except Exception as e: 41 | # 打印错误日志 42 | print(e) 43 | 44 | for artist in artists: 45 | artist_id = artist['href'].replace('/artist?id=', '').strip() 46 | artist_name = artist['title'].replace('的音乐', '') 47 | try: 48 | sql.insert_artist(artist_id, artist_name) 49 | except Exception as e: 50 | # 打印错误日志 51 | print(e) 52 | 53 | 54 | gg = 4003 55 | 56 | save_artist(gg, 0) 57 | for i in range(65, 91): 58 | save_artist(gg, i) 59 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/comments_by_music.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据歌曲 ID 获得所有的歌曲所对应的评论信息 3 | """ 4 | 5 | import requests 6 | from music_163 import sql 7 | import time 8 | import threading 9 | import pymysql.cursors 10 | 11 | 12 | class Comments(object): 13 | headers = { 14 | 'Host': 'music.163.com', 15 | 'Connection': 'keep-alive', 16 | 'Content-Length': '484', 17 | 'Cache-Control': 'max-age=0', 18 | 'Origin': 'http://music.163.com', 19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36', 20 | 'Content-Type': 'application/x-www-form-urlencoded', 21 | 'Accept': '*/*', 22 | 'DNT': '1', 23 | 'Accept-Encoding': 'gzip, deflate', 24 | 'Accept-Language': 
'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4', 25 | 'Cookie': 'JSESSIONID-WYYY=b66d89ed74ae9e94ead89b16e475556e763dd34f95e6ca357d06830a210abc7b685e82318b9d1d5b52ac4f4b9a55024c7a34024fddaee852404ed410933db994dcc0e398f61e670bfeea81105cbe098294e39ac566e1d5aa7232df741870ba1fe96e5cede8372ca587275d35c1a5d1b23a11e274a4c249afba03e20fa2dafb7a16eebdf6%3A1476373826753; _iuqxldmzr_=25; _ntes_nnid=7fa73e96706f26f3ada99abba6c4a6b2,1476372027128; _ntes_nuid=7fa73e96706f26f3ada99abba6c4a6b2; __utma=94650624.748605760.1476372027.1476372027.1476372027.1; __utmb=94650624.4.10.1476372027; __utmc=94650624; __utmz=94650624.1476372027.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 26 | } 27 | 28 | params = { 29 | 'csrf_token': '' 30 | } 31 | 32 | data = { 33 | 'params': 'Ak2s0LoP1GRJYqE3XxJUZVYK9uPEXSTttmAS+8uVLnYRoUt/Xgqdrt/13nr6OYhi75QSTlQ9FcZaWElIwE+oz9qXAu87t2DHj6Auu+2yBJDr+arG+irBbjIvKJGfjgBac+kSm2ePwf4rfuHSKVgQu1cYMdqFVnB+ojBsWopHcexbvLylDIMPulPljAWK6MR8', 34 | 'encSecKey': '8c85d1b6f53bfebaf5258d171f3526c06980cbcaf490d759eac82145ee27198297c152dd95e7ea0f08cfb7281588cdab305946e01b9d84f0b49700f9c2eb6eeced8624b16ce378bccd24341b1b5ad3d84ebd707dbbd18a4f01c2a007cd47de32f28ca395c9715afa134ed9ee321caa7f28ec82b94307d75144f6b5b134a9ce1a' 35 | } 36 | 37 | proxies = {'http': 'http://127.0.0.1:10800'} 38 | 39 | def get_comments(self, music_id, flag): 40 | self.headers['Referer'] = 'http://music.163.com/playlist?id=' + str(music_id) 41 | if flag: 42 | r = requests.post('http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + str(music_id), 43 | headers=self.headers, params=self.params, data=self.data, proxies=self.proxies) 44 | else: 45 | r = requests.post('http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + str(music_id), 46 | headers=self.headers, params=self.params, data=self.data) 47 | return r.json() 48 | 49 | 50 | if __name__ == '__main__': 51 | my_comment = Comments() 52 | 53 | 54 | def save_comments(musics, flag, connection0): 55 | for i in musics: 56 | my_music_id = i['MUSIC_ID'] 57 | try: 58 | comments = my_comment.get_comments(my_music_id, flag) 59 | if comments['total'] > 0: 60 | sql.insert_comments(my_music_id, comments['total'], str(comments), connection0) 61 | except Exception as e: 62 | # 打印错误日志 63 | print(my_music_id) 64 | print(e) 65 | time.sleep(5) 66 | 67 | 68 | music_before = sql.get_before_music() 69 | music_after = sql.get_after_music() 70 | 71 | # pymysql 链接不是线程安全的 72 | connection1 = pymysql.connect(host='localhost', 73 | user='root', 74 | password='1234', 75 | db='test', 76 | charset='utf8mb4', 77 | cursorclass=pymysql.cursors.DictCursor) 78 | 79 | connection2 = pymysql.connect(host='localhost', 80 | user='root', 81 | password='1234', 82 | db='test', 83 | charset='utf8mb4', 84 | cursorclass=pymysql.cursors.DictCursor) 85 | 86 | t1 = threading.Thread(target=save_comments, args=(music_before, True, connection1)) 87 | t2 = threading.Thread(target=save_comments, args=(music_after, False, connection2)) 88 | t1.start() 89 | t2.start() 90 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/music_by_album.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据专辑 ID 获取到所有的音乐 ID 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import time 7 | from music_163 import sql 8 | 9 | 10 | class Music(object): 11 | headers = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 15 | 'Cache-Control': 'no-cache', 16 | 'Connection': 'keep-alive', 17 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=fb5288e1c5f667324f1636d020704cab2f27ee915622b114f89027cbf60c38be2af6b9cbef2223c1f2581e3502f11b86efd60891d6f61b6f783c0d55114f8269fa801df7352f5cc4c8259876e563a6bd0212b504a8997723a0593b21d5b3d9076d4fa38c098be68e3c5d36d342e4a8e40c1f73378cec0b5851bd8a628886edbdd23a7093%3A1476623819662; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476610320.1476622020.10; __utmb=94650624.14.10.1476622020; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 18 | 'DNT': '1', 19 | 'Host': 'music.163.com', 20 | 'Pragma': 'no-cache', 21 | 'Referer': 'http://music.163.com/', 22 | 'Upgrade-Insecure-Requests': '1', 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 24 | } 25 | 26 | def save_music(self, album_id): 27 | params = {'id': album_id} 28 | # 获取专辑对应的页面 29 | r = requests.get('http://music.163.com/album', headers=self.headers, params=params) 30 | 31 | # 网页解析 32 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 33 | body = soup.body 34 | 35 | musics = body.find('ul', attrs={'class': 'f-hide'}).find_all('li') # 获取专辑的所有音乐 36 | 37 | for music in musics: 38 | music = music.find('a') 39 | music_id = music['href'].replace('/song?id=', '') 40 | music_name = music.getText() 41 | sql.insert_music(music_id, music_name, album_id) 42 | 43 | 44 | if __name__ == '__main__': 45 | albums = sql.get_all_album() 46 | my_music = Music() 47 | for i in albums: 48 | try: 49 | my_music.save_music(i['ALBUM_ID']) 50 | # print(i) 51 | except Exception as e: 52 | # 打印错误日志 53 | print(str(i) + ': ' + str(e)) 54 | time.sleep(5) 55 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/23_python爬取网易云音乐所有歌曲的评论数/sql.py: -------------------------------------------------------------------------------- 1 | """ 2 | 一般 Python 用于连接 MySQL 的工具:pymysql 3 | """ 4 | import pymysql.cursors 5 | 6 | connection = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='***', db='sunshine',charset="utf8") 7 | 8 | 9 | # 保存评论 10 | def insert_comments(music_id, comments, detail, connection): 11 | with connection.cursor() as cursor: 12 | sql = "INSERT INTO `comments` (`MUSIC_ID`, `COMMENTS`, `DETAILS`) VALUES (%s, %s, %s)" 13 | cursor.execute(sql, (music_id, comments, detail)) 14 | connection.commit() 15 | 16 | 17 | # 保存音乐 18 | def insert_music(music_id, music_name, album_id): 19 | with connection.cursor() as cursor: 20 | sql = "INSERT INTO `musics` (`MUSIC_ID`, `MUSIC_NAME`, `ALBUM_ID`) VALUES (%s, %s, %s)" 21 | cursor.execute(sql, (music_id, music_name, album_id)) 22 | connection.commit() 23 | 24 | 25 | # 保存专辑 26 | def insert_album(album_id, artist_id): 27 | with connection.cursor() as cursor: 28 | sql = "INSERT INTO `albums` (`ALBUM_ID`, `ARTIST_ID`) VALUES (%s, %s)" 29 | cursor.execute(sql, (album_id, artist_id)) 30 | connection.commit() 
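# ---- editor's note (comments only, not in the original) ---------------------
# Apart from insert_comments(), the helpers in this module all share the
# module-level `connection` opened at import time (its placeholder password
# has to be filled in first).  insert_comments() instead takes an explicit
# connection argument because comments_by_music.py calls it from two threads,
# and a pymysql connection is not thread-safe, so each thread supplies its own.
# ------------------------------------------------------------------------------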
31 | 32 | 33 | # 保存歌手 34 | def insert_artist(artist_id, artist_name): 35 | with connection.cursor() as cursor: 36 | sql = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME`) VALUES (%s, %s)" 37 | cursor.execute(sql, (artist_id, artist_name)) 38 | connection.commit() 39 | 40 | 41 | # 获取所有歌手的 ID 42 | def get_all_artist(): 43 | with connection.cursor() as cursor: 44 | sql = "SELECT `ARTIST_ID` FROM `artists` ORDER BY ARTIST_ID" 45 | cursor.execute(sql, ()) 46 | return cursor.fetchall() 47 | 48 | 49 | # 获取所有专辑的 ID 50 | def get_all_album(): 51 | with connection.cursor() as cursor: 52 | sql = "SELECT `ALBUM_ID` FROM `albums` ORDER BY ALBUM_ID" 53 | cursor.execute(sql, ()) 54 | return cursor.fetchall() 55 | 56 | 57 | # 获取所有音乐的 ID 58 | def get_all_music(): 59 | with connection.cursor() as cursor: 60 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID" 61 | cursor.execute(sql, ()) 62 | return cursor.fetchall() 63 | 64 | 65 | # 获取前一半音乐的 ID 66 | def get_before_music(): 67 | with connection.cursor() as cursor: 68 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 0, 800000" 69 | cursor.execute(sql, ()) 70 | return cursor.fetchall() 71 | 72 | 73 | # 获取后一半音乐的 ID 74 | def get_after_music(): 75 | with connection.cursor() as cursor: 76 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 800000, 1197429" 77 | cursor.execute(sql, ()) 78 | return cursor.fetchall() 79 | 80 | 81 | def dis_connect(): 82 | connection.close() 83 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/ctrip_items.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/ctrip_items.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- 
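Editor's note on project 23 (网易云音乐所有歌曲的评论数): its README describes a strict order -- artists, then albums per artist, then songs per album, then comment counts per song -- and each stage reads the IDs the previous one wrote through sql.py. A minimal driver that runs the four scripts in that order could look like the sketch below (it assumes the four scripts are on disk as shown here, that the `music_163` package they import `sql` from is importable, and that the MySQL database configured in sql.py is reachable; in the original project they are simply run as standalone scripts):

# run_music163_pipeline.py -- editor's sketch, not part of the repository
import subprocess

STEPS = [
    'artists.py',            # 1. all artists
    'album_by_artist.py',    # 2. albums for every artist
    'music_by_album.py',     # 3. songs for every album
    'comments_by_music.py',  # 4. comment counts for every song
]

for script in STEPS:
    # stop the pipeline as soon as one stage fails
    subprocess.run(['python', script], check=True)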
/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class FindtripItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | site = scrapy.Field() 16 | company = scrapy.Field() 17 | flight_time = scrapy.Field() 18 | airports = scrapy.Field() 19 | passtime = scrapy.Field() 20 | price = scrapy.Field() 21 | 22 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class FindtripSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from findtrip.spiders.washctrip import wash 8 | import pymongo 9 | from scrapy.conf import settings 10 | from scrapy import log 11 | 12 | class FindtripPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | 17 | class MongoDBPipeline(object): 18 | def __init__(self): 19 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 20 | self.db = self.client[settings['MONGO_DB']] 21 | self.post = self.db[settings['MONGO_COLL']] 22 | 23 | def process_item(self, item, spider): 24 | if item['site'] == 'Qua': 25 | if item['company']: 26 | item['company'] = wash(item['company']) 27 | if item['flight_time']: 28 | item['flight_time'] = wash(item['flight_time']) 29 | if item['airports']: 30 | item['airports'] = wash(item['airports']) 31 | if item['passtime']: 32 | item['passtime'] = wash(item['passtime']) 33 | if item['price']: 34 | item['price'] = wash(item['price']) 35 | for data in item: 36 | if not data: 37 | raise DropItem("Missing data!") 38 | self.collection.insert(dict(item)) 39 | log.msg("Question added to MongoDB database!", 40 | level=log.DEBUG, spider=spider) 41 | elif item['site'] == 'Ctrip': 42 | self.collection.insert(dict(item)) 43 | log.msg("Question added to MongoDB database!", 44 | level=log.DEBUG, spider=spider) 45 | 46 | return item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for findtrip project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'findtrip' 13 | 14 | SPIDER_MODULES = ['findtrip.spiders'] 15 | NEWSPIDER_MODULE = 'findtrip.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'findtrip (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # 配置mongoDB 27 | MONGO_HOST = "127.0.0.1" # 主机IP 28 | MONGO_PORT = 27017 # 端口号 29 | MONGO_DB = "FindTrip" # 库名 30 | MONGO_COLL = "qua_findtrip" # collection 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | #DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | #} 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'findtrip.middlewares.FindtripSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'findtrip.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | # 'findtrip.pipelines.FindtripPipeline': 300, 77 | 'findtrip.pipelines.MongoDBPipeline': 300, 78 | } 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS 
= 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/ctrip_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/ctrip_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/qua_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/qua_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/washctrip.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/washctrip.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/ctrip_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from findtrip.items import FindtripItem 3 | 4 | class CtripSpider(scrapy.Spider): 5 | name = 'ctrip' 6 | start_urls = [ 7 | "http://flights.ctrip.com/booking/XMN-BJS-day-1.html?DDate1=2016-04-19" 8 | ] 9 | 10 | def parse(self, response): 11 | sel = scrapy.Selector(response) 12 | fligint_div = "//div[@id='J_flightlist2']/div" 13 | dataList = sel.xpath(fligint_div) 14 | 15 | for index,each in enumerate(dataList): 16 | flight_each = fligint_div+'['+str(index+1)+']' 17 | flight_tr = flight_each+"//tr[@class='J_header_row']" 
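# editor's note: instead of querying relative to `each`, the loop rebuilds an
# absolute XPath for the (index+1)-th block under div#J_flightlist2 and runs it
# against `sel` again; the `istrain` check that follows skips blocks containing
# a <div> with class 'train_flight_tit' (train tickets mixed into the flight
# list) without yielding an item.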
18 | istrain = sel.xpath(flight_each + "//div[@class='train_flight_tit']") 19 | 20 | if istrain: 21 | print ("this data is train add") 22 | else: 23 | company = sel.xpath(flight_tr + "//div[@class='info-flight J_flight_no']//text()").extract() 24 | 25 | flight_time_from = sel.xpath(flight_tr + "//td[@class='right']/div[1]//text()").extract() 26 | flight_time_to = sel.xpath(flight_tr + "//td[@class='left']/div[1]//text()").extract() 27 | flight_time = [flight_time_from,flight_time_to] 28 | 29 | airports_from = sel.xpath(flight_tr + "//td[@class='right']/div[2]//text()").extract() 30 | airports_to = sel.xpath(flight_tr + "//td[@class='left']/div[2]//text()").extract() 31 | airports = [airports_from,airports_to] 32 | 33 | price_middle = sel.xpath(flight_tr + "[1]//td[@class='price middle ']/span//text()").extract() 34 | price = sel.xpath(flight_tr + "[1]//td[@class='price ']/span//text()").extract() 35 | if price_middle: 36 | price = price_middle 37 | elif price: 38 | price = price 39 | else: 40 | price = '' 41 | 42 | item = FindtripItem() 43 | item['site'] = 'Ctrip' 44 | item['company'] = company 45 | item['flight_time'] = flight_time 46 | item['airports'] = airports 47 | item['price'] = price 48 | yield item 49 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/qua_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from findtrip.items import FindtripItem 3 | 4 | class QuaSpider(scrapy.Spider): 5 | name = "qua" 6 | start_urls = [ 7 | "http://www.qua.com/flights/PEK-XMN/2016-05-12?m=CNY&from=flight_home" 8 | ] 9 | 10 | def parse(self, response): 11 | sel = scrapy.Selector(response) 12 | dataList = sel.xpath("//div[@class='m-fly-item s-oneway']") 13 | 14 | for index,each in enumerate(dataList): 15 | flight_each = "//div[@id='list-box']/div["+str(index+1)+"]" 16 | detail_span = "//div[@class='fl-detail-nav']/ul/li[1]/span[@class='nav-label']" 17 | f_route_div = "//div[@class='m-fl-info-bd']/div" 18 | 19 | airports = sel.xpath(flight_each + f_route_div + '/p[3]//text()').extract() 20 | company = sel.xpath(flight_each + f_route_div + '/p[1]//text()').extract() 21 | flight_time = sel.xpath(flight_each + f_route_div + '/p[2]//text()').extract() 22 | passtime = sel.xpath(flight_each + f_route_div + '/p[4]//text()').extract() 23 | price = sel.xpath(flight_each + "//div[@class='fl-price-box']//em//text()").extract() 24 | 25 | item = FindtripItem() 26 | item['site'] = 'Qua' 27 | item['company'] = company 28 | item['flight_time'] = flight_time 29 | item['airports'] = airports 30 | item['passtime'] = passtime 31 | item['price'] = price 32 | yield item 33 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/washctrip.py: -------------------------------------------------------------------------------- 1 | def wash(dateList): 2 | dateList = map(lambda x : x.split(), dateList) 3 | cleanList = [] 4 | for each in dateList: 5 | if each: 6 | cleanList.append(each[0]) 7 | return cleanList 8 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.csv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.csv -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.json -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = findtrip.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = findtrip 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Field, Item 9 | 10 | 11 | class PythonjobsItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #pass 15 | title = Field() 16 | city = Field() 17 | company = Field() 18 | location = Field() 19 | url = Field() 20 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class PythonjobsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class PythonjobsPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for pythonjobs project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'pythonjobs' 13 | 14 | SPIDER_MODULES = ['pythonjobs.spiders'] 15 | NEWSPIDER_MODULE = 'pythonjobs.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'pythonjobs (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'pythonjobs.middlewares.PythonjobsSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'pythonjobs.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | 
ITEM_PIPELINES = { 69 | 'pythonjobs.pipelines.PythonjobsPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/job_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/job_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/job_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from pythonjobs.items import PythonjobsItem 4 | #from bs4 import BeautifulSoup 5 | 6 | class JobspiderSpider(scrapy.Spider): 7 | name = 'jobSpider' 8 | allowed_domains = ['search.51job.com','jobs.51job.com'] 9 | 10 | def start_requests(self): 11 | for i in range(1,20): # Set pages to crawl here. 
12 | url = "http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{0}.html".format(i) 13 | yield scrapy.Request(url) 14 | 15 | def parse(self, response): 16 | for sel in response.css("html body div.dw_wp div#resultList.dw_table div.el p.t1 span a"): 17 | url = sel.re('href="(.*?)"')[0] 18 | yield scrapy.Request(url,callback=self.parse_item) 19 | 20 | def parse_item(self, response): 21 | item = PythonjobsItem() 22 | item['title'] = response.xpath('//div[@class="cn"]/h1/@title').extract()[0] 23 | item['url'] = response.url 24 | item['city'] = response.xpath('//span[@class="lname"]/text()').extract()[0] 25 | item['company'] = response.xpath('//p[@class="cname"]/a/@title').extract()[0] 26 | item['location'] = response.xpath('//p[@class="fp"]/text()').extract()[1].rstrip() 27 | return item -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = pythonjobs.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = pythonjobs 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/ghostdriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/ghostdriver.log -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = shuimujob.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = shuimujob 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- 
/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/platform.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/platform.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ShuimujobItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | title = scrapy.Field() 16 | href = scrapy.Field() 17 | author = scrapy.Field() 18 | time = scrapy.Field() 19 | content = scrapy.Field() 20 | is_dev = scrapy.Field() 21 | is_alg = scrapy.Field() 22 | is_fin = scrapy.Field() 23 | base_url_index = scrapy.Field() 24 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ShuimujobSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from scrapy.conf import settings 9 | from scrapy.exceptions import DropItem 10 | from scrapy import log 11 | 12 | class ShuimujobPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | class MongoDBPipeline(object): 17 | 18 | def __init__(self): 19 | pass 20 | 21 | 22 | def open_spider(self, spider): 23 | self.client = pymongo.MongoClient( 24 | settings['MONGODB_SERVER'], 25 | settings['MONGODB_PORT'] 26 | ) 27 | self.db = self.client[settings['MONGODB_DB']] 28 | self.collection = self.db[settings['MONGODB_COLLECTION']] 29 | 30 | def close_spider(self, spider): 31 | self.client.close() 32 | 33 | def process_item(self, item, spider): 34 | valid = True 35 | for data in item: 36 | if not data : 37 | valid = False 38 | raise DropItem("Missing {0}!".format(data)) 39 | if item['title'] == '': 40 | valid = False 41 | raise DropItem("title is '' ") 42 | if item['content'] == '': 43 | valid = False 44 | raise DropItem("content is '' ") 45 | if valid: 46 | self.collection.insert(dict(item)) 47 | return item 48 | 49 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/platform.py: -------------------------------------------------------------------------------- 1 | import sys 2 | def getPlatform(): 3 | platform='' 4 | if sys.platform.startswith('win'): 5 | platform = 'win' 6 | elif sys.platform.startswith('linux'): 7 | platform = 'linux' 8 | return platform -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for shuimujob project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'shuimujob' 13 | 14 | SPIDER_MODULES = ['shuimujob.spiders'] 15 | NEWSPIDER_MODULE = 'shuimujob.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'shuimujob (+http://www.yourdomain.com)' 20 | 21 | 22 | MONGODB_SERVER = "localhost" 23 | MONGODB_PORT = 27017 24 | MONGODB_DB = "shuimujob" 25 | MONGODB_COLLECTION = "job_info" 26 | 27 | # Obey robots.txt rules 28 | ROBOTSTXT_OBEY = False 29 | 30 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 31 | #CONCURRENT_REQUESTS = 32 32 | 33 | # Configure a delay for requests for the same website (default: 0) 34 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 35 | # See also autothrottle settings and docs 36 | #DOWNLOAD_DELAY = 3 37 | # The download delay setting will honor only one of: 38 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 39 | #CONCURRENT_REQUESTS_PER_IP = 16 40 | 41 | # Disable cookies (enabled by default) 42 | COOKIES_ENABLED = False 43 | 44 | # Disable Telnet Console (enabled by default) 45 | #TELNETCONSOLE_ENABLED = False 46 | 47 | # Override the default request headers: 48 | #DEFAULT_REQUEST_HEADERS = { 49 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 50 | # 'Accept-Language': 'en', 51 | #} 52 | 53 | # Enable or disable spider middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 55 | #SPIDER_MIDDLEWARES = { 56 | # 'shuimujob.middlewares.ShuimujobSpiderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable downloader middlewares 60 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 61 | #DOWNLOADER_MIDDLEWARES = { 62 | # 'shuimujob.middlewares.MyCustomDownloaderMiddleware': 543, 63 | #} 64 | 65 | # Enable or disable extensions 66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | # 'shuimujob.pipelines.ShuimujobPipeline': 300, 75 | 'shuimujob.pipelines.MongoDBPipeline':300 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 
98 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/shuimu_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/shuimu_spider.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/shuimu_spider.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import scrapy 3 | from shuimujob.items import ShuimujobItem 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from bs4 import BeautifulSoup 9 | from scrapy import signals 10 | from scrapy.xlib.pydispatch import dispatcher 11 | from shuimujob.platform import getPlatform 12 | 13 | class SMSpider(scrapy.spiders.CrawlSpider): 14 | ''' 15 | #要建立一个 Spider,你可以为 scrapy.spider.BaseSpider 创建一个子类,并确定三个主要的、强制的属性: 16 | #name :爬虫的识别名,它必须是唯一的,在不同的爬虫中你必须定义不同的名字. 
17 | #start_urls :爬虫开始爬的一个 URL 列表。爬虫从这里开始抓取数据,所以,第一次下载的数据将会从这些 URLS 开始。其他子 URL 将会从这些起始 URL 中继承性生成。 18 | #parse() :爬虫的方法,调用时候传入从每一个 URL 传回的 Response 对象作为参数,response 将会是 parse 方法的唯一的一个参数, 19 | #这个方法负责解析返回的数据、匹配抓取的数据(解析为 item )并跟踪更多的 URL。 20 | ''' 21 | name="shuimujob" 22 | base_url = 'http://www.newsmth.net/nForum/board/Intern' 23 | start_urls = [base_url] 24 | start_urls.extend([base_url+'?p='+str(i) for i in range(2,4)]) 25 | # start_urls = ['http://www.newsmth.net/'] 26 | platform = getPlatform() 27 | 28 | def __init__(self): 29 | scrapy.spiders.Spider.__init__(self) 30 | if self.platform == 'linux': 31 | self.driver = webdriver.PhantomJS() 32 | elif self.platform == 'win': 33 | self.driver = webdriver.PhantomJS() 34 | self.driver.set_page_load_timeout(15) 35 | dispatcher.connect(self.spider_closed, signals.spider_closed) 36 | 37 | 38 | 39 | def spider_closed(self, spider): 40 | self.driver.quit() 41 | 42 | def parse(self,response): 43 | self.driver.get(response.url) 44 | 45 | element = WebDriverWait(self.driver,30).until(EC.presence_of_all_elements_located((By.TAG_NAME,'table'))) 46 | page_source = self.driver.page_source 47 | bs_obj = BeautifulSoup(page_source, "lxml") 48 | table = bs_obj.find('table',class_='board-list tiz') 49 | intern_messages = table.find_all('tr',class_=False) 50 | for message in intern_messages: 51 | title, href, time, author = '','','','' 52 | td_9 = message.find('td',class_='title_9') 53 | if td_9: 54 | title = td_9.a.get_text().encode('utf-8','ignore') 55 | href = td_9.a['href'] 56 | td_10 = message.find('td', class_='title_10') 57 | if td_10: 58 | time=td_10.get_text().encode('utf-8','ignore') 59 | td_12 = message.find('td', class_='title_12') 60 | if td_12: 61 | author = td_12.a.get_text().encode('utf-8','ignore') 62 | item = ShuimujobItem() 63 | item['title'] = title 64 | item['href'] = href 65 | item['time'] = time 66 | item['author'] = author 67 | item['base_url_index'] = 0 68 | root_url = 'http://www.newsmth.net' 69 | # content = scrapy.Request(root_url+href,self.parse_content) 70 | if href!='': 71 | content = self.parse_content(root_url+href) 72 | # print 'content:', content 73 | item['content'] = content 74 | yield item 75 | 76 | def parse_content(self,url): 77 | 78 | self.driver.get(url) 79 | element = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located((By.TAG_NAME, 'table'))) 80 | page_source = self.driver.page_source 81 | bs_obj = BeautifulSoup(page_source, "lxml") 82 | return bs_obj.find('td', class_='a-content').p.get_text().encode('utf-8','ignore') -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/house.csv: -------------------------------------------------------------------------------- 1 | house,house_area,house_room,total_price,unit_price 2 | 滨江奥城听雨苑 ,115.83,3室2厅,515.0,44462 3 | 虹苑新寓三村 ,63.8,2室2厅,196.0,30722 4 | 天坛新寓 ,75.16,3室1厅,243.0,32332 5 | 棉鞋营小区 ,69.74,3室1厅,220.0,31546 6 | 常府街10至16号 ,62.21,2室1厅,212.0,34079 7 | house,house_area,house_room,total_price,unit_price 8 | 滨江奥城听雨苑 ,115.83,3室2厅,515.0,44462 9 | 虹苑新寓三村 ,63.8,2室2厅,196.0,30722 10 | 天坛新寓 ,75.16,3室1厅,243.0,32332 11 | 棉鞋营小区 ,69.74,3室1厅,220.0,31546 12 | 常府街10至16号 ,62.21,2室1厅,212.0,34079 13 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__init__.py -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class NjHouseItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | house=scrapy.Field() 15 | total_price=scrapy.Field() 16 | unit_price=scrapy.Field() 17 | house_room=scrapy.Field() 18 | house_area=scrapy.Field() -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class NjHouseSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class NjHousePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for nj_house project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'nj_house' 13 | 14 | SPIDER_MODULES = ['nj_house.spiders'] 15 | NEWSPIDER_MODULE = 'nj_house.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'nj_house (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'nj_house.middlewares.NjHouseSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'nj_house.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'nj_house.pipelines.NjHousePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/lj_house.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Web-Spider-and-Data-Analysis/aa1458f8acff5da86c2ab5e1339a08ad7d5e81e4/Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/lj_house.cpython-36.pyc -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/lj_house.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | from nj_house.items import NjHouseItem 5 | 6 | class LjHouseSpider(scrapy.Spider): 7 | name = "lj_house" 8 | allowed_domains = ["nj.lianjia.com"] # domains only (no path), otherwise the offsite filter drops the pagination requests 9 | start_urls = ['http://nj.lianjia.com/ershoufang/'] 10 | 11 | def parse(self, response): 12 | clears = response.css('.sellListContent li') 13 | for c in clears: 14 | item = NjHouseItem() # one fresh item per listing 15 | house = c.css('.houseInfo a::text').extract_first() 16 | house_text = c.css('.houseInfo::text').extract_first() 17 | house_info_list = [e for e in re.split(r'\|', house_text) if len(e) > 1] 18 | house_room = house_info_list[0].strip() 19 | house_area = ''.join(re.findall(r'[\d.]', house_info_list[1])) 20 | total_price = c.css('.totalPrice span::text').extract_first() 21 | unit_price = c.css('.unitPrice span::text').extract_first() 22 | unit_price = re.findall(r'\d+', unit_price)[0] 23 | 24 | item['house'] = house 25 | item['total_price'] = float(total_price) 26 | item['unit_price'] = int(unit_price) 27 | item['house_area'] = float(house_area) 28 | item['house_room'] = house_room 29 | yield item 30 | 31 | page_info = response.css('div[class="page-box fr"]').css('div::attr(page-data)').extract_first() 32 | page_list = re.findall(r'\d+', page_info) # page-data is expected to hold totalPage and curPage 33 | next_page = 'pg' + str(int(page_list[1]) + 1) 34 | url = self.start_urls[0] + next_page 35 | if int(page_list[1]) < int(page_list[0]): # stop after the last page 36 | yield scrapy.Request(url=url, callback=self.parse) 37 | -------------------------------------------------------------------------------- /Python3网络爬虫中小型项目实战集中营/27_scrapy爬取南京20000多套二手房信息/nj_house/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = nj_house.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = nj_house 12 |
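The trickiest part of lj_house.py is turning the free-text houseInfo block into house_room and house_area. The snippet below is a standalone sketch of that parsing, runnable outside Scrapy; the sample string is an assumed Lianjia listing format (estate name sits in a separate <a> tag, the remaining fields are pipe-separated), not data captured from the site.

# Standalone sketch of the houseInfo parsing used in lj_house.py.
# The sample text is an assumption about Lianjia's pipe-separated format.
import re

house_text = ' | 3室2厅 | 115.83平米 | 南 | 精装 | 有电梯'

# Drop the empty leading chunk, keep the meaningful fields.
house_info_list = [e for e in re.split(r'\|', house_text) if len(e) > 1]
house_room = house_info_list[0].strip()                         # '3室2厅'
house_area = ''.join(re.findall(r'[\d.]', house_info_list[1]))  # '115.83'

print(house_room, float(house_area))  # values line up with the first row of house.csv above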
-------------------------------------------------------------------------------- /Python3网络爬虫快速入门篇/biqukan.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | from bs4 import BeautifulSoup 3 | import requests, sys 4 | 5 | """ 6 | 类说明:下载《笔趣看》网小说《一念永恒》 7 | Parameters: 8 | 无 9 | Returns: 10 | 无 11 | """ 12 | class downloader(object): 13 | 14 | def __init__(self): 15 | self.server = 'http://www.biqukan.com/' 16 | self.target = 'http://www.biqukan.com/1_1094/' 17 | self.names = [] #存放章节名 18 | self.urls = [] #存放章节链接 19 | self.nums = 0 #章节数 20 | 21 | """ 22 | 函数说明:获取下载链接 23 | Parameters: 24 | 无 25 | Returns: 26 | 无 27 | """ 28 | def get_download_url(self): 29 | req = requests.get(url = self.target) 30 | html = req.text 31 | div_bf = BeautifulSoup(html, "lxml") 32 | div = div_bf.find_all('div', class_ = 'listmain') 33 | a_bf = BeautifulSoup(str(div[0]), "lxml") 34 | a = a_bf.find_all('a') 35 | self.nums = len(a[15:]) #剔除不必要的章节,并统计章节数 36 | for each in a[15:]: 37 | self.names.append(each.string) 38 | self.urls.append(self.server + each.get('href')) 39 | 40 | """ 41 | 函数说明:获取章节内容 42 | Parameters: 43 | target - 下载链接(string) 44 | Returns: 45 | texts - 章节内容(string) 46 | """ 47 | def get_contents(self, target): 48 | req = requests.get(url = target) 49 | html = req.text 50 | bf = BeautifulSoup(html, "lxml") 51 | texts = bf.find_all('div', class_ = 'showtxt') 52 | texts = texts[0].text.replace('\xa0'*8,'\n\n') 53 | return texts 54 | 55 | """ 56 | 函数说明:将爬取的文章内容写入文件 57 | Parameters: 58 | name - 章节名称(string) 59 | path - 当前路径下,小说保存名称(string) 60 | text - 章节内容(string) 61 | Returns: 62 | 无 63 | """ 64 | def writer(self, name, path, text): 65 | write_flag = True 66 | with open(path, 'a', encoding='utf-8') as f: 67 | f.write(name + '\n') 68 | f.writelines(text) 69 | f.write('\n\n') 70 | 71 | if __name__ == "__main__": 72 | dl = downloader() 73 | dl.get_download_url() 74 | print('《一念永恒》开始下载:') 75 | for i in range(dl.nums): 76 | dl.writer(dl.names[i], '一念永恒.txt', dl.get_contents(dl.urls[i])) 77 | sys.stdout.write(" 已下载:%.3f%%" % float(i / dl.nums * 100) + '\r') 78 | sys.stdout.flush() 79 | print('《一念永恒》下载完成') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Python3网络爬虫中小型项目实战清单 2 | 3 | 01.python爬取电影天堂 4 | 5 | 02.python爬取斗罗大陆小说 6 | 7 | 03.Python抓取欧洲足球联赛数据 8 | 9 | 04.python爬取豆瓣电影Top250 10 | 11 | 05.python爬取股票数据 12 | 13 | 06.python爬取人人贷网数据 14 | 15 | 07.python爬取创业邦创投库 16 | 17 | 08.python抓取美团网百万商家信息 *** 18 | 19 | 09.python爬取网易云音乐评论并把他们存入mysql数据库 *** 20 | 21 | 10.python爬取“网上购物”类APP 22 | 23 | 11.python爬取链家网房价信息 *** 24 | 25 | 12.python爬取并分析豆瓣中最新电影的影评(词云显示) 26 | 27 | 13.python爬取豆瓣书籍信息 28 | 29 | 14.python爬取今日头条信息并导入mongodb数据库 30 | 31 | 15.python爬取百度招聘内容并存入mongodb数据库 *** 32 | 33 | 16.python爬取熊猫直播用户信息 34 | 35 | 17.scrapy爬取游天下南京短租房信息并存入mongodb数据库 36 | 37 | 18.scrapy爬取中国医学人才网信息并以json格式保存 38 | 39 | 19.scrapy框架爬取豆瓣电影top250信息 40 | 41 | 20.scrapy爬取织梦者网站信息并存入mongodb数据库 *** 42 | 43 | 21.python爬取豆瓣电影<前任3>评论(词云显示) 44 | 45 | 22.python爬取Bilibili用户信息并导入mysql数据库 *** 46 | 47 | 23.python爬取网易云音乐所有歌曲的评论数 48 | 49 | 24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库 *** 50 | 51 | 25.scrapy爬取前程无忧网站python相关的工作信息 52 | 53 | 26.scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库 *** 54 | 55 | 27.scrapy爬取南京20000多套二手房信息 --------------------------------------------------------------------------------
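The Scrapy-based projects in this list (17-20 and 24-27) are normally run with the scrapy CLI from the directory that holds each project's scrapy.cfg, for example "scrapy crawl lj_house -o house.csv" for project 27. As a minimal sketch (assuming Scrapy 1.x and a working directory inside the nj_house project), the same crawl can also be started programmatically:

# Minimal sketch, assuming Scrapy 1.x and a working directory containing the
# nj_house project's scrapy.cfg. Roughly equivalent to:
#   scrapy crawl lj_house -o house.csv
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()          # reads the project's settings.py
settings.set('FEED_FORMAT', 'csv')         # Scrapy 1.x feed-export settings
settings.set('FEED_URI', 'house.csv')
process = CrawlerProcess(settings)
process.crawl('lj_house')                  # spider name defined in lj_house.py
process.start()                            # blocks until the crawl finishes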