├── .gitignore ├── Fluent Python └── 数据结构 │ └── 序列构成的数组.py ├── README.md ├── Web Scrapying with Python ├── Chapter01 │ └── BeautifulSouptest.py ├── Chapter02 │ ├── child_descendant.py │ ├── css.py │ ├── regex_BeautifulSoup.py │ └── soup_lambda.py ├── Chapter03 │ ├── 1_wikipedia.py │ ├── 2_getlinks.py │ ├── 3_get_all_link.py │ ├── findlinks.py │ └── wikiSpider │ │ ├── scrapy.cfg │ │ └── wikiSpider │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── article.cpython-36.pyc │ │ └── article.py ├── Chapter05 │ ├── csv_use.py │ ├── download_logo.py │ ├── download_src.py │ └── email_text.py ├── Chapter06 │ ├── 6-degreescrawlwiki.py │ └── read_csv.py ├── Chapter07 │ ├── clean-n-grams.py │ └── n-grams.py ├── Chapter08 │ ├── 2-gram-summary.py │ ├── 6-degrees-demo.py │ ├── 6-degrees-find.py │ └── MarkovGenerator.py ├── Chapter09 │ ├── 1-simpleForm.py │ ├── 2-fileSubmission.py │ ├── 3-cookies.py │ ├── 4-sessionCookies.py │ └── 5-BasicAuth.py ├── Chapter10 │ ├── 1-seleniumBasic.py │ ├── 2-waitForLoad.py │ └── 3-javascriptRedirect.py ├── Chapter11 │ ├── 1-basicImage.py │ ├── 2-cleanImage.py │ ├── 3-readWebImages.py │ └── 4-solveCaptcha.py ├── Chapter12 │ ├── headers.py │ ├── honeypotDetection.py │ └── seleniumCookies.py └── README.md ├── mongoDB资料 └── ReferenceCards15-PDF.pdf ├── pymongo ├── README.md ├── create_index.py ├── delete.py ├── insert.py ├── update.py └── 查找 │ ├── 1嵌入文档.py │ ├── 2嵌入数组.py │ ├── 3数组中嵌入文档.py │ ├── 4从查询中返回的项目字段.py │ ├── 5空字段或缺失字段.py │ └── 6限制显示行数.py ├── singleton ├── README.md ├── __new__.py ├── decorator.py ├── metaclass.py └── new_threading_safe.py └── spiders ├── Bs4基本元素.py ├── RE库基本使用.py ├── ajax今日头条.py ├── csdn_ajax.py ├── jdsearch.py ├── jianshu.py ├── newhouse.py ├── scrapy ├── BaiduStocks │ ├── BaiduStockInfo.txt │ ├── BaiduStocks │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── stocks.cpython-36.pyc │ │ │ └── stocks.py │ │ └── 调试.py │ └── scrapy.cfg ├── Tencent │ ├── Tencent │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── tencent.cpython-36.pyc │ │ │ └── tencent.py │ └── scrapy.cfg ├── jianshu │ ├── jishuspider │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── zhihu.py │ └── scrapy.cfg ├── movie │ ├── movie │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── movies.cpython-36.pyc │ │ │ └── movies.py │ └── scrapy.cfg ├── python123demo │ ├── python123demo │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── 
settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── demo.cpython-36.pyc │ │ │ └── demo.py │ └── scrapy.cfg ├── quoteturorial │ ├── quotes.json │ ├── quoteturorial │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── quotes.cpython-36.pyc │ │ │ └── quotes.py │ │ └── 调试.py │ └── scrapy.cfg ├── weibo │ ├── scrapy.cfg │ └── weibo │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── weibos.cpython-36.pyc │ │ └── weibos.py └── zhihuuser │ ├── scrapy.cfg │ └── zhihuuser │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ ├── pipelines.cpython-36.pyc │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── zhihu.cpython-36.pyc │ └── zhihu.py ├── secondSpider ├── Data_Output.py ├── Html_Downloader.py ├── Html_Parser.py ├── SpiderWork.py ├── URL_Manager.py ├── __pycache__ │ ├── Data_Output.cpython-36.pyc │ ├── Html_Downloader.cpython-36.pyc │ ├── Html_Parser.cpython-36.pyc │ └── URL_Manager.cpython-36.pyc ├── new_urls.txt ├── old_urls.txt └── start_Manager.py ├── selenium ├── Frame.py ├── Jiaohu.py ├── javaScript.py ├── 前进后退.py ├── 获取属性.py └── 选项卡管理.py ├── selenium个人邮箱.py ├── selenium模拟淘宝.py ├── 中国大学排名定向爬虫.py ├── 分布式爬虫 ├── Data_Output.py ├── Html_Downloader.py ├── Html_Parser.py ├── SpiderMan.py ├── URL_Manager.py └── __pycache__ │ ├── Data_Output.cpython-36.pyc │ ├── Html_Downloader.cpython-36.pyc │ ├── Html_Parser.cpython-36.pyc │ └── URL_Manager.cpython-36.pyc ├── 分布式进程 ├── taskManager.py └── taskWork.py ├── 医生信息索取.py ├── 多线程爬取医生.py ├── 淘宝商品信息爬取.py ├── 猫眼电影.py ├── 百度图片.py ├── 股票爬虫.py └── 豆瓣.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/.idea/** 2 | -------------------------------------------------------------------------------- /Fluent Python/数据结构/序列构成的数组.py: -------------------------------------------------------------------------------- 1 | # 容器序列 2 | # list,tuple,collection.deque 3 | # 存放的是他们所包含的任意对象的引用 4 | # 扁平序列 5 | # str,bytes,bytearray,memoryview和array.array 6 | # 存放的是值,扁平序列其实是一段连续的内存空间 7 | 8 | # 可变序列 9 | # list, 10 | # 不可变序列 11 | # tuple,str和bytes 12 | 13 | text = [i for i in range(10)] # 列表推导式,如果超过二行,考虑for循环 14 | print(text) # 列表推导的作用只用一个:生成列表 15 | 16 | 17 | text = (i for i in range(10)) # 生成表达式,只不过把方括号换成圆括号而已 18 | print(text) 19 | 20 | 21 | text = tuple(i for i in range(10)) # 生成表达式是一个函数调用过程中的唯一参数时,那么不需要 22 | print(text) # 用额外的括号把它围起来 23 | 24 | 25 | # 用*处理剩下的元素 在平行赋值中,*前缀只能出现在一个变量前面 26 | # 1 函数 27 | # def main(*args, **kwargs): 28 | # 在python中,函数用*args来获取不确定数量的位置参数,**kwargs获取不确定数量的关键字传参 29 | 30 | # 2 元组 31 | a, b , *rest = range(5) 32 | print(a, b, rest) 33 | 34 | 35 | # 列表或元祖的方法和属性 36 | # 列表 元组 37 | # s.__add__(s2) * * s+s2 拼接 ————创建一个新对象 38 | # s.__iadd__(s2) * s += 
s2,就地拼接————元组不可变 39 | # s.append(e) * 40 | # s.clear() * 删除所有元素 41 | # s.__contains__(e) * * s是否包含e————一般使用in 42 | # s.count(e) * * e在s中出现的次数 43 | # s.extend(it) * 把可迭代对象it追加给s 44 | # s.index(e) * * 在s中找到元素e第一次出现的位置 45 | # s.insert(p, e) * 在位置p之前插入元素e 46 | # s.pop([p]) * 删除最后或位于p位置的元素,并返回它的值 47 | # s.remove(e) * 删除s中第一次出现的e 48 | # s.reverse() * 就地把s的元素倒序排列 49 | # s.sort([key], [reverse]) * 就地对s的元素进行排序 50 | # sorted内置函数,可接受任何可迭代对象,最后返回一个列表。 51 | 52 | 53 | 54 | 55 | # 当数组不是首选时 56 | # 存放大量float,数组array效率更高 57 | # 频繁的对序列做先进先出的操作,deque(双端队列)的速度会更快 58 | # 如果查找操作很频繁,用set会更合适 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # py爬虫 2 | 3 | 4 | ## 设计模式 5 | - [单例模式](/singleton) 6 | 7 | ## 读书 8 | - [Python网络数据采集](/Web%20Scrapying%20with%20Python) 9 | - 进度:100% 10 | 11 | - [流畅的python](/Fluent%20Python) 12 | - 进度:50% 13 | 14 | 15 | ## 爬虫 16 | - [自己写的一些爬虫实例](/spiders) 17 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter01/BeautifulSouptest.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | 5 | html = requests.get("http://www.pythonscraping.com/pages/page1.html") 6 | 7 | ''' 8 | To get rid of this warning, change this: 9 | BeautifulSoup([your markup]) 10 | to this: 11 | BeautifulSoup([your markup], "html.parser") 12 | markup_type=markup_type)) 13 | ''' 14 | #data = BeautifulSoup(html.read()) 15 | 16 | soup = BeautifulSoup(html.text, "html.parser") 17 | 18 | 19 | print(soup.title) 20 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter02/child_descendant.py: -------------------------------------------------------------------------------- 1 | # from urllib.request import urlopen 2 | # from bs4 import BeautifulSoup 3 | # 4 | # html = urlopen("http://www.pythonscraping.com/pages/page3.html") 5 | # 6 | # data = BeautifulSoup(html, "html.parser") 7 | # 8 | # for child in data.find("table", {"id": "giftList"}).children: 9 | # print(child) 10 | 11 | import requests 12 | from bs4 import BeautifulSoup 13 | 14 | html = requests.get("http://www.pythonscraping.com/pages/page3.html") 15 | 16 | soup = BeautifulSoup(html.text, "html.parser") 17 | 18 | # children list_iterator 19 | print(type(soup.find("table", {"id": "giftList"}).children)) 20 | # include '\n' 21 | print(len(list(soup.find("table", {"id": "giftList"}).children))) 22 | print(len(list(soup.find("table", {"id": "giftList"}).descendants))) 23 | for child in soup.find("table", {"id": "giftList"}).children: 24 | print(repr(child)) 25 | print('*************') 26 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter02/css.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | html = requests.get("http://www.pythonscraping.com/pages/warandpeace.html") 5 | 6 | soup = BeautifulSoup(html.text, "html.parser") 7 | 8 | ''' 9 | name_list = data.findAll("span", {"class":"green"}) 10 | for name in name_list: 11 | print(name.get_text()) 12 | ''' 13 | 14 | prince_list = soup.findAll(text="the prince") 15 | print(len(prince_list)) 16 | -------------------------------------------------------------------------------- /Web Scrapying with 
Python/Chapter02/regex_BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | import re 6 | 7 | html = requests.get("http://www.pythonscraping.com/pages/page3.html") 8 | 9 | soup = BeautifulSoup(html.text, "html.parser") 10 | 11 | images = soup.findAll("img", {"src": re.compile("\.\.\/img\/gifts/img.*\.jpg")}) 12 | 13 | for i in images: 14 | print(i["src"]) 15 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter02/soup_lambda.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | html = requests.get("http://www.pythonscraping.com/pages/page3.html") 5 | 6 | soup = BeautifulSoup(html.text, "html.parser") 7 | 8 | 9 | prince_list = soup.findAll(lambda tag: len(tag.attrs) == 2) 10 | for i in prince_list: 11 | print(i) 12 | 13 | print(len(prince_list)) 14 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/1_wikipedia.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | html = requests.get("https://en.wikipedia.org/wiki/Kevin_Bacon") 6 | 7 | soup = BeautifulSoup(html.text, "html.parser") 8 | 9 | ''' 10 | for link in data.findAll("a"): 11 | if 'href' in link.attrs: 12 | print(link.attrs['href']) 13 | ''' 14 | 15 | for link in soup.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")): 16 | if 'href' in link.attrs: 17 | print(link.attrs['href']) 18 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/2_getlinks.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | import datetime 6 | import random 7 | import re 8 | 9 | random.seed(datetime.datetime.now()) 10 | 11 | 12 | def getLinks(articeUrl): 13 | html = requests.get("https://en.wikipedia.org" + articeUrl) 14 | soup = BeautifulSoup(html.text, "html.parser") 15 | 16 | return soup.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")) 17 | 18 | 19 | links = getLinks("/wiki/Kevin_Bacon") 20 | 21 | while len(links) > 0: 22 | newArticle = links[random.randint(0, len(links) - 1)].attrs['href'] 23 | print(newArticle) 24 | 25 | links = getLinks(newArticle) 26 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/3_get_all_link.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import re 4 | 5 | pages = set() 6 | 7 | 8 | def getLinks(articeUrl): 9 | global pages 10 | 11 | html = requests.get("https://en.wikipedia.org" + articeUrl) 12 | data = BeautifulSoup(html.text, "html.parser") 13 | 14 | try: 15 | print(data.h1.text) 16 | print(data.find(id="mw-content-text").findAll("p")[0]) 17 | print(data.find(id="ca-edit").find("span").find("a").attrs['href']) 18 | except AttributeError: 19 | print("Missing some attributes") 20 | 21 | for link in data.findAll("a", href=re.compile("^(/wiki/)")): 22 | if 'href' in link.attrs: 23 | if link.attrs['href'] not in pages: 24 | newPage = link.attrs['href'] 25 | print("new Page:") 26 | print(newPage) 27 | 
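                # Note: getLinks() calls itself for every unseen /wiki/ link it finds, so this
                # crawl has no depth bound and will eventually hit Python's default recursion
                # limit (about 1000 frames) on a site as large as Wikipedia.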
28 | pages.add(newPage) 29 | 30 | getLinks(newPage) 31 | 32 | 33 | getLinks("") 34 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/findlinks.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from bs4 import BeautifulSoup 3 | 4 | import re 5 | import datetime 6 | import random 7 | 8 | pages = set() 9 | random.seed(datetime.datetime.now()) 10 | 11 | 12 | def getInternalLinks(data, includeUrl): 13 | inLinks = [] 14 | 15 | for link in data.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")): 16 | if link.attrs['href'] is not None: 17 | if link.attrs['href'] not in inLinks: 18 | inLinks.append(link.attrs['href']) 19 | 20 | return inLinks 21 | 22 | 23 | def getExLinks(data, excludeUrl): 24 | exLinks = [] 25 | 26 | for link in data.findAll("a", href=re.compile("^(http|www)((?!" + excludeUrl + ").)*$")): 27 | if link.attrs['href'] is not None: 28 | if link.attrs['href'] not in exLinks: 29 | exLinks.append(link.attrs['href']) 30 | 31 | return exLinks 32 | 33 | 34 | def splitAddress(address): 35 | addressParts = address.replace("http://", "").split("/") 36 | 37 | return addressParts 38 | 39 | 40 | def getRandomExtLink(startPage): 41 | html = urlopen(startPage) 42 | data = BeautifulSoup(html, "html.parser") 43 | exLinks = getExLinks(data, splitAddress(startPage)[0]) 44 | 45 | if len(exLinks) == 0: 46 | inLinks = getInternalLinks(startPage) 47 | return getRandomExtLink(inLinks[random.randint(0, len(inLinks) - 1)]) 48 | else: 49 | return exLinks[random.randint(0, len(exLinks) - 1)] 50 | 51 | 52 | def followExtOnly(startSite): 53 | extLink = getRandomExtLink("http://oreilly.com") 54 | print("extLink:" + extLink) 55 | followExtOnly(extLink) 56 | 57 | 58 | followExtOnly("http://oreilly.com") 59 | 60 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = wikiSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = wikiSpider 12 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__init__.py -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | 11 | class WikispiderItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = Field() 15 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class WikispiderPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for wikiSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'wikiSpider' 13 | 14 | SPIDER_MODULES = ['wikiSpider.spiders'] 15 | NEWSPIDER_MODULE = 'wikiSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'wikiSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'wikiSpider.middlewares.WikispiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'wikiSpider.middlewares.WikispiderDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'wikiSpider.pipelines.WikispiderPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/__pycache__/article.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/__pycache__/article.cpython-36.pyc -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/article.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider 2 | from wikiSpider.items import WikispiderItem 3 | 4 | 5 | class ArticleSpider(Spider): 6 | name = "article" 7 | allowed_domains = ["en.wikipedia.org"] 8 | start_urls = ["https://en.wikipedia.org/wiki/Main_Page", 9 | "https://en.wikipedia.org/wiki/Python_%28programming_language%29"] 10 | 11 | def parse(self, response): 12 | item = WikispiderItem() 13 | title = response.xpath('//h1/text()')[0].extract() 14 | print("Title is:" + title) 15 | item['title'] = title 16 | return item 17 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter05/csv_use.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from urllib.request import urlopen 3 | from bs4 import BeautifulSoup 4 | 5 | html = urlopen('https://en.wikipedia.org/wiki/Comparison_of_text_editors') 6 | soup = BeautifulSoup(html, 'lxml') 7 | table = soup.find('table', {'class': 'wikitable'}) 8 | print(type(table)) 9 | rows = table.findAll('tr') 10 | 11 | csvFile = open("editors.csv", 'wt', newline='', encoding='utf-8') 12 | writer = csv.writer(csvFile) 13 | 14 | try: 15 | for row in rows: 16 | csvRow = [] 17 | for cell in row.findAll(['td', 'th']): 18 | csvRow.append(cell.text) 19 | writer.writerow(csvRow) 20 | finally: 21 | csvFile.close() -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter05/download_logo.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen, urlretrieve 2 | from bs4 import BeautifulSoup 3 | 4 | html = urlopen("http://www.pythonscraping.com") 5 | data = BeautifulSoup(html, "html.parser") 6 | 7 | logo_location = data.find("a", {"id": "logo"}).find("img")["src"] 8 | urlretrieve(logo_location, "logo.jpg") 9 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter05/download_src.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from urllib.request import 
urlopen, urlretrieve 4 | from bs4 import BeautifulSoup 5 | 6 | download_dir = "downloaded" 7 | 8 | baseUrl = "http://pythonscraping.com" 9 | 10 | 11 | def getURI(url, source): 12 | if source.startswith("http://www."): 13 | url = "http://" + source[11:] 14 | elif source.startswith("http://"): 15 | url = source 16 | elif source.startswith("www."): 17 | url = "http://" + source[4:] 18 | else: 19 | url = baseUrl + "/" + source 20 | 21 | if baseUrl not in url: 22 | print('not in url') 23 | return None 24 | 25 | return url 26 | 27 | 28 | def getDownloadPath(baseUrl, url, download_dir): 29 | path = url.replace("www.", "") 30 | path = path.replace(baseUrl, "") 31 | path = download_dir + path 32 | 33 | print(path) 34 | dir = os.path.dirname(path) 35 | print(dir) 36 | if not os.path.exists(dir): 37 | os.makedirs(dir) 38 | 39 | return path 40 | 41 | 42 | def main(): 43 | html = urlopen(baseUrl) 44 | soup = BeautifulSoup(html, "html.parser") 45 | download_list = soup.findAll(src=True) 46 | 47 | for download in download_list: 48 | fileUrl = getURI(baseUrl, download["src"]) 49 | if fileUrl is not None: 50 | print(fileUrl) 51 | urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, download_dir)) 52 | # break 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter05/email_text.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | 3 | from email.mime.text import MIMEText 4 | 5 | msg = MIMEText("This is a mail test") 6 | 7 | msg['Subject'] = "An Email ALERT" 8 | msg['From'] = "ds@ds-virtual-machine" 9 | msg['To'] = "942203701@qq.com" 10 | 11 | s = smtplib.SMTP('localhost') 12 | s.send_message(msg) 13 | s.quit() 14 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter06/6-degreescrawlwiki.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | import pymysql 4 | from urllib.request import urlopen 5 | 6 | conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', db='mysql', charset='utf8') 7 | cur = conn.cursor() 8 | cur.execute("USE wikipedia") 9 | 10 | 11 | def pageScraped(url): 12 | cur.execute("SELECT * FROM pages WHERE url = %s", (url)) 13 | if cur.rowcount == 0: 14 | return False 15 | page = cur.fetchone() 16 | 17 | cur.execute("SELECT * FROM links WHERE fromPageId = %s", (int(page[0]))) 18 | if cur.rowcount == 0: 19 | return False 20 | return True 21 | 22 | 23 | def insertPageIfNotExists(url): 24 | cur.execute("SELECT * FROM pages WHERE url = %s", (url)) 25 | if cur.rowcount == 0: 26 | cur.execute("INSERT INTO pages (url) VALUES (%s)", (url)) 27 | conn.commit() 28 | return cur.lastrowid 29 | else: 30 | return cur.fetchone()[0] 31 | 32 | 33 | def insertLink(fromPageId, toPageId): 34 | cur.execute("SELECT * FROM links WHERE fromPageId = %s AND toPageId = %s", (int(fromPageId), int(toPageId))) 35 | if cur.rowcount == 0: 36 | cur.execute("INSERT INTO links (fromPageId, toPageId) VALUES (%s, %s)", (int(fromPageId), int(toPageId))) 37 | conn.commit() 38 | 39 | 40 | def getLinks(pageUrl, recursionLevel): 41 | global pages 42 | if recursionLevel > 4: 43 | return 44 | pageId = insertPageIfNotExists(pageUrl) 45 | html = urlopen("http://en.wikipedia.org" + pageUrl) 46 | bsObj = BeautifulSoup(html, "html.parser") 47 | for link in bsObj.findAll("a", 
href=re.compile("^(/wiki/)((?!:).)*$")): 48 | insertLink(pageId, insertPageIfNotExists(link.attrs['href'])) 49 | if not pageScraped(link.attrs['href']): 50 | # We have encountered a new page, add it and search it for links 51 | newPage = link.attrs['href'] 52 | print(newPage) 53 | getLinks(newPage, recursionLevel + 1) 54 | else: 55 | print("Skipping: " + str(link.attrs['href']) + " found on " + pageUrl) 56 | 57 | 58 | getLinks("/wiki/Kevin_Bacon", 0) 59 | cur.close() 60 | conn.close() 61 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter06/read_csv.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from io import StringIO 3 | import csv 4 | 5 | ''' 6 | Don't name your file csv.py. 7 | When you do, Python will look in your file for the csv code instead of the standard library csv module. 8 | ''' 9 | 10 | data = urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv").read().decode('ascii', 'ignore') 11 | 12 | dataFile = StringIO(data) 13 | 14 | ''' 15 | csvRead = csv.reader(dataFile) 16 | for row in csvRead: 17 | print(row) 18 | ''' 19 | 20 | dictReader = csv.DictReader(dataFile) 21 | 22 | print(dictReader.fieldnames) 23 | 24 | for row in dictReader: 25 | print(row) 26 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter07/clean-n-grams.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from bs4 import BeautifulSoup 3 | import re 4 | import string 5 | from collections import OrderedDict 6 | 7 | 8 | def cleanInput(input): 9 | ''' 10 | 移除转义字符,过滤Unicode字符 11 | ''' 12 | # input = re.sub('\n+', " ", input) # 替换换行符 13 | input = re.sub('\[[0-9]*\]', "", input) # remove digit 14 | input = re.sub(r'\s', " ", input) # remove all blank character 15 | input = bytes(input, "UTF-8") # 更改编码 16 | input = input.decode("ascii", "ignore") 17 | 18 | cleanInput = [] 19 | input = input.split(' ') 20 | for item in input: 21 | item = item.strip(string.punctuation) 22 | if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'): 23 | cleanInput.append(item) 24 | return cleanInput 25 | 26 | 27 | def getNgrams(input, n): 28 | input = cleanInput(input) 29 | # print(input) 30 | output = dict() 31 | for i in range(len(input) - n + 1): 32 | newNGram = " ".join(input[i:i + n]) 33 | if newNGram in output: 34 | output[newNGram] += 1 35 | else: 36 | output[newNGram] = 1 37 | return output 38 | 39 | 40 | html = urlopen("https://en.wikipedia.org/wiki/Python_(programming_language)") 41 | soup = BeautifulSoup(html, "html.parser") 42 | content = soup.find("div", {"id": "mw-content-text"}).get_text() 43 | 44 | ngrams = getNgrams(content, 2) 45 | 46 | # Using OrderedDict sort 47 | ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True)) 48 | 49 | print(ngrams) 50 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter07/n-grams.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | def ngrams(indata, n): 6 | # indata = re.split(r' |,|;|\n|', indata) 7 | indata = indata.split() 8 | print(indata) 9 | outodata = [] 10 | 11 | for i in range(len(indata) - n + 1): 12 | outodata.append(indata[i:i + n]) 13 | print(outodata) 14 | break 15 | 
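    # Note: the print() and break inside the loop above are debugging leftovers; with the
    # break in place only the first n-gram is collected, which is why the count printed at
    # the bottom of this script comes out as 1.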
16 | return outodata 17 | 18 | 19 | html = urlopen("https://en.wikipedia.org/wiki/Python_(programming_language)") 20 | soup = BeautifulSoup(html, "html.parser") 21 | 22 | content = soup.find("div", {"id": "mw-content-text"}).get_text() 23 | # content = soup.find("div", id_="mw-content-text").get_text() 24 | 25 | ngram = ngrams(content, 2) 26 | 27 | print(ngram) 28 | print("2-ngrams count is:" + str(len(ngram))) 29 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter08/2-gram-summary.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from bs4 import BeautifulSoup 3 | import re 4 | import string 5 | import operator 6 | 7 | 8 | def isCommon(ngram): 9 | commonWords = ["the", "be", "and", "of", "a", "in", "to", "have", "it", "i", "that", "for", "you", "he", "with", 10 | "on", "do", "say", "this", "they", "is", "an", "at", "but", "we", "his", "from", "that", "not", "by", 11 | "she", "or", "as", "what", "go", "their", "can", "who", "get", "if", "would", "her", "all", "my", 12 | "make", "about", "know", "will", "as", "up", "one", "time", "has", "been", "there", "year", "so", 13 | "think", "when", "which", "them", "some", "me", "people", "take", "out", "into", "just", "see", 14 | "him", "your", "come", "could", "now", "than", "like", "other", "how", "then", "its", "our", "two", 15 | "more", "these", "want", "way", "look", "first", "also", "new", "because", "day", "more", "use", 16 | "no", "man", "find", "here", "thing", "give", "many", "well"] 17 | for word in ngram: 18 | if word in commonWords: 19 | return True 20 | return False 21 | 22 | 23 | def cleanText(input): 24 | input = re.sub('\n+', " ", input).lower() 25 | input = re.sub('\[[0-9]*\]', "", input) 26 | input = re.sub(' +', " ", input) 27 | input = re.sub("u\.s\.", "us", input) 28 | input = bytes(input, "UTF-8") 29 | input = input.decode("ascii", "ignore") 30 | return input 31 | 32 | 33 | def cleanInput(input): 34 | input = cleanText(input) 35 | cleanInput = [] 36 | input = input.split(' ') 37 | for item in input: 38 | item = item.strip(string.punctuation) 39 | if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'): 40 | cleanInput.append(item) 41 | print(len(cleanInput)) 42 | return cleanInput 43 | 44 | 45 | def getNgrams(input, n): 46 | input = cleanInput(input) 47 | output = {} 48 | for i in range(len(input) - n + 1): 49 | ngramTemp = " ".join(input[i:i + n]) 50 | if ngramTemp not in output: 51 | output[ngramTemp] = 0 52 | output[ngramTemp] += 1 53 | return output 54 | 55 | 56 | def getFirstSentenceContaining(ngram, content): 57 | # print(ngram) 58 | sentences = content.split(".") 59 | # print(sentences) 60 | for sentence in sentences: 61 | if ngram in sentence.lower(): 62 | return sentence 63 | return "" 64 | 65 | 66 | content = str(urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt").read(), 'utf-8') 67 | # print(content) 68 | ngrams = getNgrams(content, 2) 69 | sortedNGrams = sorted(ngrams.items(), key=operator.itemgetter(1), reverse=True) 70 | print(len(sortedNGrams)) 71 | 72 | selected_ngrams = [] 73 | for item in sortedNGrams: 74 | if item[1] > 2 and not isCommon(item[0].split()): 75 | selected_ngrams.append(item) 76 | print(selected_ngrams) 77 | print('the number of the significant 2-grams is:' + str(len(selected_ngrams))) 78 | 79 | count = 0 80 | for ngram in selected_ngrams: 81 | count += 1 82 | print(ngram) 83 | print(getFirstSentenceContaining(ngram[0], content)) 
84 | if count > 5: 85 | break -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter08/6-degrees-demo.py: -------------------------------------------------------------------------------- 1 | class SolutionFound(RuntimeError): 2 | def __init__(self, message): 3 | self.message = message 4 | 5 | 6 | def getLinks(fromPageId): 7 | data = {1: [2, 3, 4], 8 | 2: [5, 6, 7], 9 | 3: [8, 9, 10], 10 | 4: [11, 12, 13], 11 | 6: [14, 15, 16]} 12 | if fromPageId not in data: 13 | return None 14 | return data[fromPageId] 15 | 16 | 17 | def constructDict(currentPageId): 18 | links = getLinks(currentPageId) 19 | if links: 20 | return dict(zip(links, [{}] * len(links))) 21 | return {} 22 | 23 | 24 | def searchDepth(targetPageId, currentPageId, linkTree, depth): 25 | print('depth: ', depth) 26 | # print(id(linkTree)) 27 | if depth == 0: 28 | return linkTree 29 | if not linkTree: 30 | linkTree = constructDict(currentPageId) 31 | if not linkTree: 32 | return {} 33 | if targetPageId in linkTree.keys(): 34 | print('TAREGT: ' + str(targetPageId) + ' FOUND!') 35 | raise SolutionFound('PAGE:' + str(currentPageId)) 36 | 37 | for branchkey, branchvalue in linkTree.items(): 38 | try: 39 | linkTree[branchkey] = searchDepth(targetPageId, branchkey, 40 | branchvalue, depth - 1) 41 | except SolutionFound as e: 42 | print(e.message) 43 | raise SolutionFound('PAGE:' + str(currentPageId)) 44 | return linkTree 45 | 46 | 47 | try: 48 | linkTree = searchDepth(14, 1, {}, 4) 49 | 50 | print('No solution found') 51 | except SolutionFound as e: 52 | print(e.message) 53 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter08/6-degrees-find.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from bs4 import BeautifulSoup 3 | import pymysql 4 | 5 | conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='mysql', charset='utf8') 6 | cur = conn.cursor() 7 | cur.execute("USE wikipedia") 8 | 9 | 10 | def getUrl(pageId): 11 | cur.execute("SELECT url FROM pages WHERE id = %s", (int(pageId))) 12 | if cur.rowcount == 0: 13 | return None 14 | return cur.fetchone()[0] 15 | 16 | 17 | def getLinks(fromPageId): 18 | cur.execute("SELECT toPageId FROM links WHERE fromPageId = %s", (int(fromPageId))) 19 | if cur.rowcount == 0: 20 | return None 21 | return [x[0] for x in cur.fetchall()] 22 | 23 | 24 | def searchBreadth(targetPageId, currentPageId, depth, nodes): 25 | if nodes is None or len(nodes) == 0: 26 | return None 27 | if depth <= 0: 28 | for node in nodes: 29 | if node == targetPageId: 30 | return [node] 31 | return None 32 | # depth is greater than 0 -- go deeper! 
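    # Each level re-queries the links table for every node and recurses with depth - 1,
    # so together with the range(0, 4) driver at the bottom of this script this behaves
    # like an iterative-deepening search whose cost grows roughly exponentially with depth.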
33 | for node in nodes: 34 | found = searchBreadth(targetPageId, node, depth - 1, getLinks(node)) 35 | if found is not None: 36 | return found + [currentPageId] # list.append() returns None, so build the path with + instead 37 | return None 38 | 39 | 40 | nodes = getLinks(1) 41 | targetPageId = 123428 42 | for i in range(0, 4): 43 | found = searchBreadth(targetPageId, 1, i, nodes) 44 | if found is not None: 45 | print(found) 46 | for node in found: 47 | print(getUrl(node)) 48 | break 49 | else: 50 | print("No path found") 51 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter08/MarkovGenerator.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from random import randint 3 | import collections 4 | 5 | 6 | def wordListSum(wordList): 7 | sum = 0 8 | for word, value in wordList.items(): 9 | sum += value 10 | return sum 11 | 12 | 13 | def retrieveRandomWord(wordList): 14 | randIndex = randint(1, wordListSum(wordList)) 15 | for word, value in wordList.items(): 16 | randIndex -= value 17 | if randIndex <= 0: 18 | return word 19 | 20 | 21 | def buildWordDict(text): 22 | # Remove newlines and quotes 23 | text = text.replace("\n", " ") 24 | text = text.replace("\"", "") 25 | 26 | # Make sure punctuation marks are treated as their own "word," so they will be included 27 | # in the Markov chain 28 | punctuation = [',', '.', ';', ':'] 29 | for symbol in punctuation: 30 | text = text.replace(symbol, " " + symbol + " ") 31 | 32 | words = text.split(" ") 33 | # Filter out empty words 34 | words = [word for word in words if word != ""] 35 | 36 | wordDict = {} 37 | for i in range(1, len(words)): 38 | if words[i - 1] not in wordDict: 39 | # Create a new dictionary for this word 40 | wordDict[words[i - 1]] = {} 41 | if words[i] not in wordDict[words[i - 1]]: 42 | wordDict[words[i - 1]][words[i]] = 0 43 | wordDict[words[i - 1]][words[i]] += 1 44 | 45 | """ 46 | # defaultdict: 47 | 48 | wordDict = collections.defaultdict(dict) 49 | for i in range(1, len(words)): 50 | if words[i] not in wordDict[words[i - 1]]: 51 | wordDict[words[i - 1]][words[i]] = 0 52 | wordDict[words[i - 1]][words[i]] += 1 53 | """ 54 | 55 | """ 56 | # setdefault: 57 | 58 | 59 | wordDict = {} 60 | for i in range(1, len(words)): 61 | if words[i] not in wordDict.setdefault(words[i - 1], {}): 62 | wordDict[words[i - 1]][words[i]] = 0 63 | wordDict[words[i - 1]][words[i]] += 1 64 | """ 65 | 66 | return wordDict 67 | 68 | 69 | text = str(urlopen("https://pythonscraping.com/files/inaugurationSpeech.txt").read(), 'utf-8') 70 | wordDict = buildWordDict(text) 71 | # print(wordDict) 72 | 73 | # Generate a Markov chain of length 100 74 | length = 100 75 | chain = "" 76 | currentWord = "I" 77 | for i in range(0, length): 78 | chain += currentWord + " " 79 | # print(wordDict[currentWord]) 80 | currentWord = retrieveRandomWord(wordDict[currentWord]) 81 | 82 | print(chain) 83 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter09/1-simpleForm.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | params = {'firstname': 'Ryan', 'lastname': 'Mitchell'} 4 | r = requests.post("http://pythonscraping.com/files/processing.php", data=params) 5 | print(r.text) 6 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter09/2-fileSubmission.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | 3 | files = {'uploadFile': open('../files/Python-logo.png', 'rb')} 4 | r = requests.post("http://pythonscraping.com/pages/processing2.php", 5 | files=files) 6 | print(r.text) 7 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter09/3-cookies.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | params = {'username': 'Ryan', 'password': 'password'} 4 | r = requests.post("http://pythonscraping.com/pages/cookies/welcome.php", params) 5 | print("Cookie is set to:") 6 | print(r.cookies.get_dict()) 7 | print("-----------") 8 | print("Going to profile page...") 9 | r = requests.get("http://pythonscraping.com/pages/cookies/profile.php", cookies=r.cookies) 10 | print(r.text) 11 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter09/4-sessionCookies.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | session = requests.Session() 4 | 5 | params = {'username': 'username', 'password': 'password'} 6 | s = session.post("http://pythonscraping.com/pages/cookies/welcome.php", params) 7 | print("Cookie is set to:") 8 | print(s.cookies.get_dict()) 9 | print("-----------") 10 | print("Going to profile page...") 11 | s = session.get("http://pythonscraping.com/pages/cookies/profile.php") 12 | print(s.text) 13 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter09/5-BasicAuth.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.auth import AuthBase 3 | from requests.auth import HTTPBasicAuth 4 | 5 | auth = HTTPBasicAuth('ryan', 'password') 6 | r = requests.post(url="http://pythonscraping.com/pages/auth/login.php", auth=auth) 7 | print(r.text) 8 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter10/1-seleniumBasic.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time 3 | 4 | driver = webdriver.Firefox() 5 | # driver = webdriver.PhantomJS() 6 | driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html') 7 | time.sleep(5) 8 | print(driver.find_element_by_id('content').text) 9 | driver.close() -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter10/2-waitForLoad.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support.ui import WebDriverWait 4 | from selenium.webdriver.support import expected_conditions as EC 5 | 6 | driver = webdriver.Firefox() 7 | driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html") 8 | try: 9 | element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton"))) 10 | finally: 11 | print(driver.find_element_by_id("content").text) 12 | driver.close() -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter10/3-javascriptRedirect.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 
import time 3 | from selenium.webdriver.remote.webelement import WebElement 4 | from selenium.common.exceptions import StaleElementReferenceException 5 | 6 | 7 | def waitForLoad(driver): 8 | elem = driver.find_element_by_tag_name("html") 9 | count = 0 10 | while True: 11 | count += 1 12 | if count > 20: 13 | print("Timing out after 10 seconds and returning") 14 | return 15 | time.sleep(.5) 16 | try: 17 | elem == driver.find_element_by_tag_name("html") 18 | except StaleElementReferenceException: 19 | return 20 | 21 | 22 | driver = webdriver.Firefox() 23 | driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html") 24 | waitForLoad(driver) 25 | print(driver.page_source) 26 | 27 | driver.close() 28 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter11/1-basicImage.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageFilter 2 | 3 | kitten = Image.open("kitten.jpg") 4 | blurryKitten = kitten.filter(ImageFilter.GaussianBlur) 5 | blurryKitten.save("kitten_blurred.jpg") 6 | blurryKitten.show() -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter11/2-cleanImage.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import subprocess 3 | 4 | 5 | def cleanFile(filePath, newFilePath): 6 | image = Image.open(filePath) 7 | 8 | # Set a threshold value for the image, and save 9 | image = image.point(lambda x: 0 if x < 143 else 255) 10 | image.save(newFilePath) 11 | 12 | # call tesseract to do OCR on the newly created image 13 | subprocess.call(["tesseract", newFilePath, "output"]) 14 | 15 | # Open and read the resulting data file 16 | outputFile = open("output.txt", 'r') 17 | print(outputFile.read()) 18 | outputFile.close() 19 | 20 | 21 | cleanFile("text_2.png", "text_2_clean.png") -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter11/3-readWebImages.py: -------------------------------------------------------------------------------- 1 | import time 2 | from urllib.request import urlretrieve 3 | import subprocess 4 | from selenium import webdriver 5 | 6 | # driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs') 7 | driver = webdriver.Firefox() 8 | driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200") 9 | time.sleep(2) 10 | 11 | driver.find_element_by_id("img-canvas").click() 12 | # The easiest way to get exactly one of every page 13 | imageList = set() 14 | 15 | # Wait for the page to load 16 | time.sleep(10) 17 | print(driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style")) 18 | while "pointer" in driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"): 19 | # While we can click on the right arrow, move through the pages 20 | driver.find_element_by_id("sitbReaderRightPageTurner").click() 21 | time.sleep(2) 22 | # Get any new pages that have loaded (multiple pages can load at once) 23 | pages = driver.find_elements_by_xpath("//div[@class='pageImage']/div/img") 24 | for page in pages: 25 | image = page.get_attribute("src") 26 | imageList.add(image) 27 | # break 28 | 29 | driver.quit() 30 | 31 | # Start processing the images we've collected URLs for with Tesseract 32 | for image in sorted(imageList): 33 | urlretrieve(image, 
"page.jpg") 34 | p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 35 | p.wait() 36 | f = open("page.txt", "r") 37 | print(f.read()) 38 | # break 39 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter11/4-solveCaptcha.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlretrieve 2 | from urllib.request import urlopen 3 | from bs4 import BeautifulSoup 4 | import subprocess 5 | import requests 6 | from PIL import Image 7 | from PIL import ImageOps 8 | 9 | 10 | def cleanImage(imagePath): 11 | image = Image.open(imagePath) 12 | image = image.point(lambda x: 0 if x < 143 else 255) 13 | borderImage = ImageOps.expand(image, border=20, fill='white') 14 | borderImage.save(imagePath) 15 | 16 | 17 | html = urlopen("http://www.pythonscraping.com/humans-only") 18 | bsObj = BeautifulSoup(html, "html.parser") 19 | # Gather prepopulated form values 20 | imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"] 21 | formBuildId = bsObj.find("input", {"name": "form_build_id"})["value"] 22 | captchaSid = bsObj.find("input", {"name": "captcha_sid"})["value"] 23 | captchaToken = bsObj.find("input", {"name": "captcha_token"})["value"] 24 | 25 | captchaUrl = "http://pythonscraping.com" + imageLocation 26 | urlretrieve(captchaUrl, "captcha.jpg") 27 | cleanImage("captcha.jpg") 28 | p = subprocess.Popen(["tesseract", "captcha.jpg", "captcha"], stdout= 29 | subprocess.PIPE, stderr=subprocess.PIPE) 30 | p.wait() 31 | f = open("captcha.txt", "r") 32 | 33 | # Clean any whitespace characters 34 | captchaResponse = f.read().replace(" ", "").replace("\n", "") 35 | print("Captcha solution attempt: " + captchaResponse) 36 | 37 | if len(captchaResponse) == 5: 38 | params = {"captcha_token": captchaToken, "captcha_sid": captchaSid, 39 | "form_id": "comment_node_page_form", "form_build_id": formBuildId, 40 | "captcha_response": captchaResponse, "name": "Ryan Mitchell", 41 | "subject": "I come to seek the Grail", 42 | "comment_body[und][0][value]": 43 | "...and I am definitely not a bot"} 44 | r = requests.post("http://www.pythonscraping.com/comment/reply/10", 45 | data=params) 46 | responseObj = BeautifulSoup(r.text, 'html.parse') 47 | if responseObj.find("div", {"class": "messages"}) is not None: 48 | print(responseObj.find("div", {"class": "messages"}).get_text()) 49 | else: 50 | print("There was a problem reading the CAPTCHA correctly!") 51 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter12/headers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | session = requests.Session() 5 | headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"} 6 | url = "https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending" 7 | req = session.get(url, headers=headers) 8 | 9 | bsObj = BeautifulSoup(req.text, "lxml") 10 | print(bsObj.find("table",{"class":"table-striped"}).get_text) 11 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter12/honeypotDetection.py: -------------------------------------------------------------------------------- 1 | from selenium 
import webdriver 2 | from selenium.webdriver.remote.webelement import WebElement 3 | 4 | driver = webdriver.PhantomJS(executable_path='') 5 | driver.get("http://pythonscraping.com/pages/itsatrap.html") 6 | links = driver.find_elements_by_tag_name("a") 7 | for link in links: 8 | if not link.is_displayed(): 9 | print("The link "+link.get_attribute("href")+" is a trap") 10 | 11 | fields = driver.find_elements_by_tag_name("input") 12 | for field in fields: 13 | if not field.is_displayed(): 14 | print("Do not change value of "+field.get_attribute("name")) 15 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter12/seleniumCookies.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | # driver = webdriver.PhantomJS(executable_path='') 4 | driver = webdriver.Firefox() 5 | driver.get("http://pythonscraping.com") 6 | driver.implicitly_wait(1) 7 | print(driver.get_cookies()) 8 | 9 | savedCookies = driver.get_cookies() 10 | 11 | driver2 = webdriver.PhantomJS(executable_path='') 12 | driver2.get("http://pythonscraping.com") 13 | driver2.delete_all_cookies() 14 | for cookie in savedCookies: 15 | driver2.add_cookie(cookie) 16 | 17 | driver2.get("http://pythonscraping.com") 18 | driver2.implicitly_wait(1) 19 | print(driver2.get_cookies()) 20 | -------------------------------------------------------------------------------- /Web Scrapying with Python/README.md: -------------------------------------------------------------------------------- 1 | ## Web Scraping with Python 2 | 3 | 4 | # Python网络数据采集 5 | - [第1章 初见网络爬虫](Chapter01/) 6 | - [第2章 复杂HTML解析](Chapter02/) 7 | - [第3章 开始采集](Chapter03/) 8 | - [第4章 使用API](Chapter04/) 9 | - [第5章 存储数据](Chapter05/) 10 | - [第6章 读取文档](Chapter06/) 11 | - [第7章 数据清洗](Chapter07/) 12 | - [第8章 自然语言处理](Chapter08/) 13 | - [第9章 穿越网页表单与登录窗口采集](Chapter09/) 14 | - [第10章 采集JavaScript](Chapter10/) 15 | - [第11章 图像识别与文字处理](Chapter11/) 16 | - [第12章 避开采集陷阱](Chapter12/) 17 | - [第13章 用爬虫测试网站](Chapter13/) 18 | - [第14章 远程采集](Chapter14/) 19 | 20 | 21 | ## 注意 22 | 此书代码使用的是Python3 23 | 24 | ## 实践 25 | - [马尔可夫文字生成器](Chapter08/MarkovGenerator.py) 26 | - [深度优先遍历](Chapter08/6-degrees-demo.py) 27 | - [维基百科广度优先遍历](Chapter08/6-degrees-find.py) 28 | -------------------------------------------------------------------------------- /mongoDB资料/ReferenceCards15-PDF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/mongoDB资料/ReferenceCards15-PDF.pdf -------------------------------------------------------------------------------- /pymongo/README.md: -------------------------------------------------------------------------------- 1 | pymongo 对mongoDB进行 2 | 插入 3 | 查找 4 | 更新 5 | -------------------------------------------------------------------------------- /pymongo/create_index.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | import pymongo 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | result = db.profiles.create_index([('user_id', pymongo.ASCENDING)], unique=True) 7 | 8 | # The index prevents us from inserting a document whose user_id is already in the collection 9 | user_profiles = [ 10 | {'user_id': 222, 'name': 'Luke'}, 11 | {'user_id': 252, 'name': 'Ziltoid'}] 12 | result = db.profiles.insert_many(user_profiles) 13 | client.close() 
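A minimal sketch, assuming the same local MongoDB instance and the 'text' database used by create_index.py, of how the unique index above behaves: inserting another document with an existing user_id raises DuplicateKeyError.

from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

client = MongoClient(host='localhost', port=27017)
db = client['text']  # same database as create_index.py

try:
    # user_id 222 was already inserted by create_index.py, so the unique index rejects this
    db.profiles.insert_one({'user_id': 222, 'name': 'Luke again'})
except DuplicateKeyError as exc:
    print('insert rejected by the unique index:', exc)

client.close()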
-------------------------------------------------------------------------------- /pymongo/delete.py: --------------------------------------------------------------------------------
1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # database name 5 | 6 | db['inventory'].delete_one({}) 7 | # Delete operations do not drop indexes, even if deleting all documents from a collection 8 | db['inventory'].delete_many({}) 9 | 10 | db['inventory'].remove()  # deprecated in PyMongo 3; delete_many({}) above already clears the collection 11 | client.close()
-------------------------------------------------------------------------------- /pymongo/insert.py: --------------------------------------------------------------------------------
1 | from pymongo import MongoClient, InsertOne 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # database name 5 | 6 | # -----------------Inserting a Document---------------------# 7 | post_id = db['inventory'].insert_one( 8 | {'item': "canvas", 'qty': 100, 'tags': ["cotton"], 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}} 9 | ) 10 | 11 | db['inventory'].bulk_write([InsertOne({'item': 'bulk-demo', 'qty': 1})])  # bulk_write needs a list of write operations 12 | print(post_id) 13 | 14 | # ------------ list all collection names in this database -------------------# 15 | print(db.collection_names())  # deprecated in newer PyMongo; list_collection_names() is the replacement 16 | 17 | # --------Getting a Single Document With find_one()--------# 18 | 19 | print(db['inventory'].find_one()) 20 | print(db['inventory'].find_one({'item': 'canvas'})) 21 | 22 | # A common task in web applications is to get an ObjectId from the 23 | # request URL and find the matching document. 24 | # It’s necessary in this case to convert the ObjectId from a string before passing it to find_one: 25 | # 26 | 27 | 28 | # -----------------Bulk insert ---------------------# 29 | post_id = db['inventory'].insert_many( 30 | [{'item': "canvas1", 'qty': 100, 'tags': ["cotton"], 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}}, 31 | {'item': "canvas2", 'qty': 100, 'tags': ["cotton"], 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}} 32 | ] 33 | ) 34 | print(post_id) 35 | print(post_id.inserted_ids) 36 | client.close()  # PyMongo 3 reopens the connection automatically when db is used again below 37 | 38 | # ------Querying for More Than One Document----------# 39 | # --------------------find()-------------------------# 40 | 41 | 42 | 43 | # -------------------Aggregation Framework-----------# 44 | result = db['aggregation'].insert_many( 45 | [{'x': "1", 'tags': ["cat", 'dog', 'mouse']}, 46 | {'x': "2", 'tags': ["cat", 'dog', 'mouse']}, 47 | {'x': "3", 'tags': ["cat", 'dog', 'mouse']}, 48 | {'x': "4", 'tags': ['dog']}, 49 | {'x': "5", 'tags': ['pig']}, 50 | ] 51 | ) 52 | 53 | from bson.son import SON 54 | pipeline = [ 55 | {'$unwind': '$tags'}, 56 | {'$group': {'_id': '$tags', 'count': {'$sum': 1}}}, 57 | {'$sort': SON([('count', -1), ('_id', -1)])}, 58 | ] 59 | print(list(db['aggregation'].aggregate(pipeline))) 60 | 61 | # map_reduce() needs JavaScript map/reduce functions and an output collection name 62 | from bson.code import Code 63 | mapper = Code("function () { this.tags.forEach(function (t) { emit(t, 1); }); }") 64 | reducer = Code("function (key, values) { return Array.sum(values); }") 65 | print(db['aggregation'].map_reduce(mapper, reducer, 'tag_counts'))
-------------------------------------------------------------------------------- /pymongo/update.py: --------------------------------------------------------------------------------
1 | # Add a new field to documents in a collection 2 | from pymongo import MongoClient 3 | 4 | client = MongoClient(host='localhost', port=27017) 5 | db = client['text'] # database name 6 | 7 | # -----------------Inserting a Document---------------------# 8 | db.inventory.insert_many([ 9 | {'item': "canvas", 'qty': 100, 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}, 'status': "A"}, 10 | {'item': "journal", 'qty': 25, 'size': {'h': 14, 'w': 21, 'uom': "cm"}, 'status': "A"}, 11 | {'item': "mat", 'qty': 85, 'size': {'h': 27.9, 'w': 35.5, 'uom': "cm"}, 'status': "A"}, 12 | {'item': "mousepad", 'qty': 25, 'size': {'h':
19, 'w': 22.85, 'uom': "cm"}, 'status': "P"}, 13 | {'item': "notebook", 'qty': 50, 'size': {'h': 8.5, 'w': 11, 'uom': "in"}, 'status': "P"}, 14 | {'item': "paper", 'qty': 100, 'size': {'h': 8.5, 'w': 11, 'uom': "in"}, 'status': "D"}, 15 | {'item': "planner", 'qty': 75, 'size': {'h': 22.85, 'w': 30, 'uom': "cm"}, 'status': "D"}, 16 | {'item': "postcard", 'qty': 45, 'size': {'h': 10, 'w': 15.25, 'uom': "cm"}, 'status': "A"}, 17 | {'item': "sketchbook", 'qty': 80, 'size': {'h': 14, 'w': 21, 'uom': "cm"}, 'status': "A"}, 18 | {'item': "sketch pad", 'qty': 95, 'size': {'h': 22.85, 'w': 30.5, 'uom': "cm"}, 'status': "A"} 19 | ]) 20 | 21 | # db['inventory'].update_one({'item': 'paper'}, 22 | # {'$set': {'size.uom': 'cm', 'status': 'p'}, 23 | # '$currentDate': {'lastModified': True}}) 24 | # 25 | # db['inventory'].update_many({'qty': {'$lt': 50}}, 26 | # {'$set': {'size.uom': 'in', 'status': 'p'}, 27 | # '$currentDate': {'lastModified': True}}) 28 | 29 | 30 | # $addToSet:向数组中添加元素,若数组本身含有该元素,则不添加,否则,添加,这样就避免了数组中的元素重复现象; 31 | # $push:向数组尾部添加元素,但它不管数组中有没有该元素,都会添加 32 | db['inventory'].update({'item': 'canvas'}, 33 | {'$addToSet': {'comments': {'name': 456, 'status': 'p'}}, 34 | '$currentDate': {'lastModified': True}}, True) 35 | 36 | db['inventory'].update({'item': 'canvas'}, 37 | {'$addToSet': {'comments': {'name': 456, 'status': 'p'}}, 38 | '$currentDate': {'lastModified': True}}, True) 39 | 40 | db['inventory'].update({'item': 'canvas'}, 41 | {'$push': {'comments': {'name': 456, 'status': 'p'}}, 42 | '$currentDate': {'lastModified': True}}, True) 43 | # replace 44 | 45 | # db['inventory'].replace_one( 46 | # {'item': "paper"}, 47 | # {'item': "paper", 'instock': [{'warehouse': "A", 'qty': 60}, {'warehouse': "B", 'qty': 40}]} 48 | # ) 49 | # db['inventory'].update({'size.h': 30}, {'size': tmp}, True) 50 | client.close() 51 | -------------------------------------------------------------------------------- /pymongo/查找/1嵌入文档.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | post_id = db['inventory'].insert_many( 7 | [{'item': "canvas1", 'qty': 20, 'tags': ["blank", 'red', 'blue'], 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}}, 8 | {'item': "canvas2", 'qty': 100, 'tags': ["red", 'blank'], 'size': {'h': 8.5, 'w': 11.5, 'uom': "in"}}, 9 | {'item': "canvas3", 'qty': 60, 'tags': ["blank", 'red'], 'size': {'h': 8.5, 'w': 11.5, 'uom': "in"}}, 10 | {'item': "canvas4", 'qty': 45, 'tags': ["blank", 'red'], 'size': {'h': 30, 'w': 20.5, 'uom': "cm"}}, 11 | {'item': "canvas5", 'qty': 30, 'tags': ['blue'], 'size': {'h': 30, 'w': 20.5, 'uom': "cm"}} 12 | ] 13 | ) 14 | 15 | # --------------嵌入文档 ----------------------# 16 | print(list(db['inventory'].find({'size.h': 28}))) # 查询嵌入文档里面的字段,用. 
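# --- Added sketch (not in the original file): dot notation on an embedded
# --- document also combines with query operators, so you can match a range
# --- instead of one exact value. Reuses the `db` handle defined above.
print(list(db['inventory'].find({'size.h': {'$lt': 15}})))                      # h smaller than 15
print(list(db['inventory'].find({'size.uom': 'in', 'size.h': {'$gte': 8.5}})))  # two nested fields combined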
17 | # 字典里面的字段(field)必须完全匹配 order 顺序不能乱 18 | print(list(db['inventory'].find({'size': {'h': 28, 'w': 35.5, 'uom': "cm"}}))) 19 | print(list(db['inventory'].find({'size': {'w': 35.5, 'uom': "cm", 'h': 28}}))) 20 | print(list(db['inventory'].find({'size': {'w': 35.5, 'uom': "cm"}}))) 21 | 22 | client.close() 23 | -------------------------------------------------------------------------------- /pymongo/查找/2嵌入数组.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | post_id = db['inventory'].insert_many( 7 | [{'item': "canvas1", 'qty': 20, 'tags': ["blank", 'red', 'blue'], 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}}, 8 | {'item': "canvas2", 'qty': 100, 'tags': ["red", 'blank'], 'size': {'h': 8.5, 'w': 11.5, 'uom': "in"}}, 9 | {'item': "canvas3", 'qty': 60, 'tags': ["blank", 'red'], 'size': {'h': 8.5, 'w': 11.5, 'uom': "in"}}, 10 | {'item': "canvas4", 'qty': 45, 'tags': ["blank", 'red'], 'size': {'h': 30, 'w': 20.5, 'uom': "cm"}}, 11 | {'item': "canvas5", 'qty': 30, 'tags': ['blue'], 'size': {'h': 30, 'w': 20.5, 'uom': "cm"}} 12 | ] 13 | ) 14 | 15 | # -------------嵌入数组------------------------# 16 | # print(list(db['inventory'].find({'tags': ['blue']}))) # 完全匹配 17 | # print(list(db['inventory'].find({'tags': ['red', 'blank']}))) 18 | 19 | # ---只要拥有red和blank即可,不必在意顺序---------# 20 | # all用在array上,不能在嵌入文档 21 | print(list(db['inventory'].find({'tags': {'$all': ['red', 'blank']}}))) 22 | 23 | # -----------tags字段至少包含一个blue-----------# 24 | # print(list(db['inventory'].find({'tags': 'blue'}))) 25 | 26 | 27 | # -----------tags字段的大小 -----------------# 28 | # print(list(db['inventory'].find({'tags': {'$size': 3}}))) 29 | 30 | # -------------. 
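# --- Added sketch (not in the original file): two more array queries against
# --- the same inventory collection. `$in` matches documents whose tags contain
# --- at least one of the listed values; `tags.0` matches on the first element,
# --- which is what the commented-out line below demonstrates on 'inventory2'.
print(list(db['inventory'].find({'tags': {'$in': ['red', 'blue']}})))  # tags contain 'red' or 'blue'
print(list(db['inventory'].find({'tags.0': 'blank'})))                 # first tag is 'blank'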
31 | # print(list(db['inventory2'].find({'tags.0': 'blue'}))) 32 | -------------------------------------------------------------------------------- /pymongo/查找/3数组中嵌入文档.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | # post_id = db['inventory'].insert_many( 7 | # [{'item': "canvas1", 'tags': [{'h': 28, 'uom': "in"}, {'h': 30, 'uom': "cm"}]}, 8 | # {'item': "canvas2", 'tags': [{'h': 15}]}, 9 | # {'item': "canvas3", 'tags': [{'h': 10, 'uom': "cm"}, {'h': 28, 'uom': "in"}]}, 10 | # {'item': "canvas4", 'tags': [{'h': 10, 'uom': "cm"}, {'h': 30, 'uom': "cm"}]}, 11 | # {'item': "canvas5", 'tags': [{'h': 28, 'uom': "cm"}, {'h': 30, 'uom': "cm"}]}, 12 | # {'item': "canvas6", 'tags': [{'h': 30, 'uom': "out"}, {'h': 28, 'uom': "cm"}]}, 13 | # ] 14 | # ) 15 | 16 | # -------------嵌入数组------------------------# 17 | # Equality matches on the whole embedded/nested document 18 | # require an exact match of the specified document, including the field orde 19 | # print(list(db['inventory'].find({'tags': {'h': 28, 'uom': 'in'}}))) 20 | # print(list(db['inventory'].find({'tags': {'uom': 'in', 'h': 28}}))) 21 | 22 | # 字段查询 23 | # print(list(db['inventory'].find({'tags.0.uom': 'in'}))) 24 | 25 | # elemMatch array文档至少一个满足 同一个元素中的键值组合 26 | # print(list(db['inventory'].find({'tags': {'$elemMatch': {'uom': 'in', 'h': 28}}}))) 27 | # print(list(db['inventory'].find({'tags': {'$elemMatch': {'h': 28}}}))) 28 | print(list(db['inventory'].find({'tags': {'$elemMatch': {'h': {'$gt': 10, '$lte': 20}}}}))) 29 | 30 | # 例如,以下查询匹配文档,其中嵌套在tags数组中的任何文档的h字段大于11, 31 | # 并且数组中的任何文档(但不一定是相同的嵌入文档)的h字段小于或等于20: 32 | print(list(db['inventory'].find({'tags.h': {'$gt': 11, '$lte': 20}}))) 33 | client.close() 34 | -------------------------------------------------------------------------------- /pymongo/查找/4从查询中返回的项目字段.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | db['inventory'].insert_many([ 7 | {'item': "journal", 'status': "A", 'size': {'h': 14, 'w': 21, 'uom': "cm"}, 8 | 'instock': [{'warehouse': "A", 'qty': 5}]}, 9 | {'item': "notebook", 'status': "A", 'size': {'h': 8.5, 'w': 11, 'uom': "in"}, 10 | 'instock': [{'warehouse': "C", 'qty': 5}]}, 11 | {'item': "paper", 'status': "D", 'size': {'h': 8.5, 'w': 11, 'uom': "in"}, 12 | 'instock': [{'warehouse': "A", 'qty': 60}]}, 13 | {'item': "planner", 'status': "D", 'size': {'h': 22.85, 'w': 30, 'uom': "cm"}, 14 | 'instock': [{'warehouse': "A", 'qty': 40}]}, 15 | {'item': "postcard", 'status': "A", 'size': {'h': 10, 'w': 15.25, 'uom': "cm"}, 16 | 'instock': [{'warehouse': "B", 'qty': 15}, {'warehouse': "C", 'qty': 35}]} 17 | ]) 18 | 19 | # 结果中只显示item, status 和 _id(默认) 20 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1}))) 21 | 22 | # Suppress _id Field 23 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, '_id': 0}))) 24 | 25 | # Return All But the Excluded Fields 26 | print(list(db['inventory'].find({'status': 'A'}, {'status': 0, 'instock': 0}))) 27 | 28 | # Return Specific Fields in Embedded Documents 29 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, "size.uom": 1}))) 30 | 31 | # Suppress Specific Fields in Embedded Documents 32 | 
print(list(db['inventory'].find({'status': 'A'}, {"size.uom": 0}))) 33 | 34 | # Projection on Embedded Documents in an Array 35 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, "instock.qty": 1}))) 36 | 37 | # Project Specific Array Elements in the Returned Array 38 | # The following example uses the $slice projection operator to return the last element in the instock array 39 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, "instock": {'$slice': 1}}))) 40 | 41 | 42 | # print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, "instock": {'$slice': [1, -1]}}))) 43 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, "instock": 1}))) 44 | 45 | client.close() 46 | -------------------------------------------------------------------------------- /pymongo/查找/5空字段或缺失字段.py: -------------------------------------------------------------------------------- 1 | # Query for Null or Missing Fields 2 | 3 | from pymongo import MongoClient 4 | 5 | client = MongoClient(host='localhost', port=27017) 6 | db = client['text'] # 数据库名字 7 | 8 | # db['inventory'].insert_many([ 9 | # {'_id': 1, 'item': None}, 10 | # {'_id': 2} 11 | # ]) 12 | 13 | # The { item : null } query matches documents 14 | # that either contain the item field whose value is null or that do not contain the item field 15 | print(list(db['inventory'].find({'item': None}))) 16 | 17 | # only documents that contain the item field whose value is null 18 | print(list(db['inventory'].find({'item': {'$type': 10}}))) 19 | 20 | # The { item : { $exists: false } } query matches documents that do not contain the item field: 21 | print(list(db['inventory'].find({'item': {'$exists': True}}))) 22 | print(list(db['inventory'].find({'item': {'$exists': False}}))) 23 | 24 | client.close() -------------------------------------------------------------------------------- /pymongo/查找/6限制显示行数.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | 7 | cursor = db['inventory'].find({}).limit(2) 8 | for i in cursor: 9 | print(i) 10 | 11 | 12 | cursor = db['inventory'].find({}).limit(2).skip(2) 13 | for i in cursor: 14 | print(i) 15 | 16 | 17 | # cursor = db['inventory'].find({'$or': [{'qty': {'$gt': 95}}, {'qty': {'$lt': 30}}]}) 18 | # cursor = db['inventory'].find({'qty': {'$gt': 80}}) 19 | # for i in cursor: 20 | # print(i) -------------------------------------------------------------------------------- /singleton/README.md: -------------------------------------------------------------------------------- 1 | ## 单例模式 2 | - [ 元类](./metaclass.py) 3 | - [ 装饰器](./decorator.py) 4 | - [ __new__方法](./__new__.py) 5 | - [new 方法 线程安全](./new_threading_safe.py) -------------------------------------------------------------------------------- /singleton/__new__.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import time 3 | 4 | 5 | class Singleton(object): 6 | 7 | def __new__(cls, *args, **kwargs): 8 | if not hasattr(Singleton, "_instance"): 9 | Singleton._instance = super(Singleton, cls).__new__(cls) 10 | return Singleton._instance 11 | 12 | 13 | def task(arg): 14 | obj = Singleton(arg) 15 | print(obj) 16 | 17 | 18 | for i in range(20): 19 | t = threading.Thread(target=task, args=[i, ]) 20 | t.start() 21 | 
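# --- Added sketch (not part of the original __new__.py): the __new__-based
# --- singleton above returns one shared object, but __init__ still runs on
# --- every call, so per-instance state is overwritten; the lock-protected
# --- variant lives in new_threading_safe.py. `Config` is an illustrative name.
class Config(object):

    def __new__(cls, *args, **kwargs):
        if not hasattr(Config, "_instance"):
            Config._instance = super(Config, cls).__new__(cls)
        return Config._instance

    def __init__(self, value):
        self.value = value  # reassigned on every Config(...) call


a = Config(1)
b = Config(2)
print(a is b)   # True  -> the same shared object
print(a.value)  # 2     -> state overwritten by the second call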
-------------------------------------------------------------------------------- /singleton/decorator.py: -------------------------------------------------------------------------------- 1 | def Singleton(cls): 2 | _instance = {} 3 | 4 | def _singleton(*args, **kargs): 5 | if cls not in _instance: 6 | _instance[cls] = cls(*args, **kargs) 7 | return _instance[cls] 8 | return _singleton 9 | 10 | 11 | @Singleton 12 | class A(object): # A = singleton(A) 13 | a = 1 14 | 15 | def __init__(self, x=0): 16 | self.x = x 17 | 18 | 19 | obj1 = A(2) 20 | obj2 = A(3) 21 | print(obj1, obj2) 22 | -------------------------------------------------------------------------------- /singleton/metaclass.py: -------------------------------------------------------------------------------- 1 | class SingletonType(type): 2 | def __call__(cls, *args, **kwargs): 3 | if not hasattr(cls, "_instance"): 4 | cls._instance = super().__call__(*args, **kwargs) # 创建一个类对象 5 | return cls._instance 6 | 7 | 8 | class Foo(metaclass=SingletonType): 9 | def __init__(self, name): 10 | self.name = name 11 | 12 | obj1 = Foo('name1') 13 | obj2 = Foo('name2') 14 | obj3 = Foo('name3') 15 | print(obj1, obj2, obj3) 16 | print(obj1.name, obj2.name, obj3.name) 17 | 18 | -------------------------------------------------------------------------------- /singleton/new_threading_safe.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import time 4 | 5 | 6 | class Singleton(object): 7 | _instance_lock = threading.Lock() 8 | 9 | def __new__(cls, *args, **kwargs): 10 | with Singleton._instance_lock: 11 | if not hasattr(Singleton, "_instance"): 12 | Singleton._instance = super(Singleton, cls).__new__(cls) 13 | return Singleton._instance 14 | 15 | 16 | def task(arg): 17 | obj = Singleton(arg) 18 | print(obj) 19 | 20 | 21 | for i in range(10): 22 | t = threading.Thread(target=task, args=[i, ]) 23 | t.start() 24 | -------------------------------------------------------------------------------- /spiders/Bs4基本元素.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | 4 | 5 | url = 'http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/' 6 | re = requests.get(url) 7 | print(re.url) 8 | html = re.text 9 | soup = BeautifulSoup(html) 10 | print(soup.attrs) 11 | print(soup.prettify()) 12 | # print(soup.b.string) # 标签内字符串的注释部分 13 | # print(soup.p.string) # 标签内非字符串 14 | -------------------------------------------------------------------------------- /spiders/RE库基本使用.py: -------------------------------------------------------------------------------- 1 | import re 2 | # group(0)永远是原始字符串,group(1)、group(2)……表示第1、2、……个子串。 3 | # re.search(pattern= , string= , flags=0) 4 | # 从一个字符串中搜索匹配正则表达式的第一个位置 5 | # pattern 正则表达式的字符串或原生字符串表示 6 | # string 待匹配字符串 7 | # flags 正则表达式使用时的控制标记 8 | ''' 9 | re.I 忽略正则表达式的大小写 10 | re.M 给定字符串的每行当中匹配开始 11 | re.S 默认匹配除换行外的所以匹配 12 | ''' 13 | 14 | # match = re.search(r'[1-9]\d{5}', 'BIT 100081 TSU 100084',) 15 | # print(match.re) 16 | # print(match.pos) 17 | # print(match.endpos) 18 | # print(match.string) 19 | # print('xxxxxxxxx') 20 | # print(match.group(0)) 21 | # print(match.start()) 22 | # print(match.end()) 23 | # print(match.span()) 24 | # 25 | # print(match) 26 | # pat = re.compile(r'') 27 | # rst = pat.search('BIT 100081 TSU 100084') 28 | # print(rst) 29 | # # 函数式用法,一次性操作 30 | # ''' 31 | # 面向对象用法:编译后的多次操作 32 | # pat = re.compile(r'[1-9]\d{5}') #将正则表达式的字符串形式编译成正则表达式对象 33 | # rst 
= pat.search('BIT 100081') 34 | # ''' 35 | # if match: 36 | # print(match.group(0)) 37 | # 38 | # ''' 39 | # re.match(pattern, string, flags=0) 40 | # 从一个字符串的开始位置起匹配正则表达式,返回match对象 41 | # ''' 42 | # # match = re.match(r'[1-9]\d{5}', 'BIT 100081') 43 | # match = re.match(r'[1-9]\d{5}', '100081 BIT') 44 | # if match: 45 | # print(match.group(0)) 46 | # 47 | # 48 | # 49 | # ''' 50 | # re.findall() 51 | # 搜素字符串,以列表类型返回全部匹配的字串 52 | # ''' 53 | # 54 | # ls = re.findall(r'[1-9]\d{5}', 'BIT100081 TSU100084') 55 | # ls = re.findall(r'(\+86[1][23456789]\d{9}|' 56 | # r'86[1][23456789]\d{9})', '8613125134887 +8611125134887 +8613125134887') 57 | # print(ls) 58 | # 59 | # ''' 60 | # re.split(pattern, string, maxsplit=0, flag=0) 61 | # 将一个字符串按照正则表达式匹配结果进行分割,返回列表类型 62 | # maxsplit: 正则表达式使用时的控制标记 63 | # ''' 64 | # ls = re.split(r'[1-9]\d{5}', 'BIT100081 TSU100084') 65 | # print(ls) 66 | # ls = re.split(r'[1-9]\d{5}', 'BIT100081 TSU100084', maxsplit=1) 67 | # print(ls) 68 | # 69 | # ''' 70 | # re.findite() 71 | # 搜素字符串,返回一个匹配结果的迭代类型,每个迭代元素为match对象 72 | # ''' 73 | # for m in re.finditer(r'[1-9]\d{5}', 'BIT100081 TSU100084'): 74 | # print(m.group(0)) 75 | # 76 | # ''' 77 | # re.sub(pattern, repl, string, count=0, flags=0) 78 | # 在一个字符串中替换所有匹配正则表达式的字串,返回替换后的字符串 79 | # repl: 替换匹配字符串的字符串 80 | # count:匹配的最大替换次数 81 | # ''' 82 | # # 83 | # ls = re.sub(r'[1-9]\d{5}', ':zipcode', 'BIT100081 TSU100084') 84 | # print(ls) 85 | -------------------------------------------------------------------------------- /spiders/ajax今日头条.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from urllib.parse import urlencode 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from requests.exceptions import ConnectionError 7 | import re 8 | from multiprocessing import Pool 9 | from hashlib import md5 10 | from json.decoder import JSONDecodeError 11 | 12 | 13 | def get_page_index(offset, keyword): 14 | data = { 15 | 'autoload': 'true', 16 | 'count': 20, 17 | 'cur_tab': 1, 18 | 'format': 'json', 19 | 'keyword': keyword, 20 | 'offset': offset, 21 | } 22 | params = urlencode(data) 23 | base = 'http://www.toutiao.com/search_content/' 24 | url = base + '?' 
+ params 25 | print(url) 26 | try: 27 | response = requests.get(url) 28 | if response.status_code == 200: 29 | return response.text 30 | return None 31 | except ConnectionError: 32 | print('Error occurred') 33 | return None 34 | 35 | 36 | def download_image(url): 37 | print('Downloading', url) 38 | try: 39 | response = requests.get(url) 40 | if response.status_code == 200: 41 | save_image(response.content) 42 | return None 43 | except ConnectionError: 44 | return None 45 | 46 | 47 | def save_image(content): 48 | file_path = '{0}'.format(os.getcwd() + '\今日头条照片') 49 | if not os.path.exists(file_path): 50 | os.makedirs(file_path) 51 | image_path = '{0}/{1}.{2}'.format(os.getcwd() + '\今日头条照片', md5(content).hexdigest(), 'jpg') 52 | if not os.path.exists(image_path): 53 | with open(image_path, 'wb') as f: 54 | f.write(content) 55 | f.close() 56 | 57 | 58 | def parse_page_index(text): 59 | try: 60 | data = json.loads(text) 61 | if data and 'data' in data.keys(): 62 | for item in data.get('data'): 63 | yield item.get('article_url') 64 | except JSONDecodeError: 65 | pass 66 | 67 | 68 | def get_page_detail(url): 69 | headers = { 70 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} 71 | try: 72 | response = requests.get(url, headers=headers) 73 | if response.status_code == 200: 74 | return response.text 75 | return None 76 | except ConnectionError: 77 | print('Error occurred') 78 | return None 79 | 80 | 81 | def parse_page_detail(html, url): 82 | soup = BeautifulSoup(html, 'lxml') 83 | result = soup.select('title') 84 | title = result[0].get_text() if result else '' 85 | images_pattern = re.compile('gallery: JSON.parse\("(.*)"\)', re.S) 86 | result = re.search(images_pattern, html) 87 | if result: 88 | data = json.loads(result.group(1).replace('\\', '')) 89 | if data and 'sub_images' in data.keys(): 90 | sub_images = data.get('sub_images') 91 | images = [item.get('url') for item in sub_images] 92 | for image in images: 93 | download_image(image) 94 | return { 95 | 'title': title, 96 | 'url': url, 97 | 'images': images 98 | } 99 | 100 | 101 | # def save_to_mongo(result): 102 | # if db[MONGO_TABLE].insert(result): 103 | # print('Successfully Saved to Mongo', result) 104 | # return True 105 | # return False 106 | 107 | 108 | def main(offset): 109 | KEYWORD = '街拍' 110 | text = get_page_index(offset, KEYWORD) 111 | urls = parse_page_index(text) 112 | for url in urls: 113 | if url != None: 114 | html = get_page_detail(url) 115 | result = parse_page_detail(html, url) 116 | # if result : save_to_mongo(result) 117 | 118 | 119 | if __name__ == '__main__': 120 | # GROUP_START = 1 121 | # GROUP_END = 3 122 | main(0) 123 | # pool = Pool() 124 | # groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)]) 125 | # pool.map(main, groups) 126 | # pool.close() 127 | # pool.join() -------------------------------------------------------------------------------- /spiders/csdn_ajax.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | def get_page(): 5 | headers = {'cookie':'', 6 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} 7 | url = 'https://www.csdn.net/api/articles?type=more&category=home&shown_offset=0' 8 | try: 9 | r = requests.get(url, headers=headers) 10 | if r.status_code == 200: 11 | html = r.json() 12 | articles = html['articles'] 13 | if len(articles) == 0: 14 | print(url) 15 | return 
r.json() 16 | return None 17 | except ConnectionError: 18 | return None 19 | 20 | 21 | def pares_page(html): 22 | articles = html['articles'] 23 | print(len(articles)) 24 | for article in articles: 25 | yield article['title'] 26 | 27 | 28 | def main(): 29 | for i in range(20): 30 | html = get_page() 31 | yield from pares_page(html) 32 | 33 | if __name__ == '__main__': 34 | l = list(main()) 35 | print(len(l)) -------------------------------------------------------------------------------- /spiders/jdsearch.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import time 5 | 6 | 7 | def get_frist(page, s): 8 | params = { 9 | 'keyword': '小米手机', 10 | 'enc': 'utf-8', 11 | 'qrst': '1', 12 | 'rt': '1', 13 | 'stop': '1', 14 | 'vt': '2', 15 | 'bs': '1', 16 | 'psort':'3', 17 | 'ev': 'exbrand_小米(MI)^', 18 | 'page': str(page), 19 | 's': s, 20 | 'click': '0', 21 | } 22 | url = 'https://search.jd.com/Search?' 23 | try: 24 | r = requests.get(url, params=params) 25 | r.raise_for_status() 26 | r.encoding = 'utf-8' 27 | print(r.url) 28 | return r.text 29 | except Exception as e: 30 | print(r.status_code) 31 | 32 | 33 | def get_last(page, s): 34 | log_id = time.time() 35 | log_id = '%.5f' % log_id 36 | params = { 37 | 'keyword': '小米手机', 38 | 'enc': 'utf-8', 39 | 'qrst': '1', 40 | 'rt': '1', 41 | 'stop': '1', 42 | 'vt': '2', 43 | 'bs': '1', 44 | 'psort': '3', 45 | 'ev': 'exbrand_小米(MI)^', 46 | 'page': str(page), 47 | 's': s, 48 | 'scrolling': 'y', 49 | 'log_id': log_id, 50 | 'tpl': '3_M', 51 | } 52 | url = 'https://search.jd.com/Search?' 53 | try: 54 | r = requests.get(url, params=params) 55 | r.raise_for_status() 56 | r.encoding = 'utf-8' 57 | print(r.url) 58 | return r.text 59 | except Exception as e: 60 | print(r.status_code) 61 | 62 | 63 | def get_info(text, count): 64 | soup = BeautifulSoup(text, 'html.parser') 65 | for child in soup.find_all(class_='gl-item'): 66 | data = {} 67 | try: 68 | data['price'] = child.find('strong').attrs['data-price'] 69 | except: 70 | data['price'] = child.find('strong').find('i').text 71 | 72 | try: 73 | data['shop'] = child.find(class_='p-shop').find('a').text.strip() 74 | except: 75 | print('这是一个广告') 76 | continue 77 | 78 | try: 79 | data['name'] = child.find(class_='p-name').find('em').text 80 | data['commit'] = child.find(class_='p-commit').find('strong').find('a').text 81 | except: 82 | continue 83 | print(data) 84 | save_to_file(data) 85 | count += 1 86 | if count == 200: 87 | break 88 | return count 89 | 90 | 91 | def save_to_file(data): 92 | file = 'goods.txt' 93 | with open(file, 'a', encoding='utf-8') as f: 94 | f.write(str(data)) 95 | f.write('\n') 96 | 97 | 98 | if __name__ == '__main__': 99 | # text = get_frist() 100 | s = 1 101 | count = 0 102 | for i in range(0, 10, 2): 103 | text = get_frist(i, s) 104 | s = s + 30 105 | count = get_info(text, count) 106 | if count == 200: 107 | print('200个信息爬取完毕') 108 | break 109 | 110 | text = get_last(i+1, s) 111 | s = s + 30 112 | count = get_info(text, count) 113 | if count == 200: 114 | print('200个信息爬取完毕') 115 | break 116 | -------------------------------------------------------------------------------- /spiders/jianshu.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from urllib.parse import urlencode 4 | import pymongo 5 | client = pymongo.MongoClient('localhost') 6 | db = client['jianshu'] 7 | data = [] 8 | 9 | 10 | def 
get_first_page(url): 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 13 | } 14 | try: 15 | r = requests.get(url, headers=headers) 16 | 17 | if r.status_code == 200: 18 | return r.text 19 | return None 20 | except ConnectionError: 21 | print('抓取失败', url) 22 | return None 23 | 24 | 25 | def get_page(url): 26 | headers = { 27 | 'X-CSRF-Token': '6vJnbFxpgkYWu28t+TQd77DYYeG/HuELzV4vKveTleCyCWtAFd408Un7Z5cwn3b1hzZB3uGqzUQprnJKOL3lgw==', 28 | 'X-PJAX': 'true', 29 | 'X-Requested-With': 'XMLHttpRequest', 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 31 | } 32 | try: 33 | r = requests.get(url, headers=headers) 34 | if r.status_code == 200: 35 | return r.text 36 | return None 37 | except ConnectionError: 38 | print('抓取失败', url) 39 | return None 40 | 41 | 42 | def save_to_mongo(result): 43 | db['result'].insert(result) 44 | 45 | 46 | def parse_first_page(html): 47 | global data 48 | soup = BeautifulSoup(html, 'lxml') 49 | note_list = soup.find('ul', class_='note-list') 50 | if note_list is None: 51 | return None 52 | for li in note_list.find_all('li'): 53 | try: 54 | id = 'seen_snote_ids%5B%5D=' + li.get('data-note-id') 55 | data.append(id) 56 | yield { 57 | 'title': li.find('div').find('a').text, 58 | 'abstract': li.find('p').text, 59 | 'nickname': li.find(class_='meta').find(class_='nickname').text 60 | } 61 | except: 62 | continue 63 | 64 | 65 | def parse_page(html): 66 | global data 67 | soup = BeautifulSoup(html, 'lxml') 68 | for li in soup.find_all('li'): 69 | try: 70 | id = 'seen_snote_ids%5B%5D=' + li.get('data-note-id') 71 | data.append(id) 72 | yield { 73 | 'title': li.find('div').find('a').text, 74 | 'abstract': li.find('p').text, 75 | 'nickname': li.find(class_='meta').find(class_='nickname').text 76 | } 77 | except: 78 | continue 79 | 80 | 81 | def main(): 82 | global data 83 | # 第一次请求 84 | print('正在解析第一页') 85 | url = 'https://www.jianshu.com/?&page=1' 86 | html = get_first_page(url) 87 | if html is None: 88 | return False 89 | for result in parse_first_page(html): 90 | save_to_mongo(result) 91 | # 弟二三请求都是get请求 92 | # 后面是post请求 93 | print('解析分页数据') 94 | for i in range(2, 16): 95 | params = '&'.join(data) 96 | url = 'https://www.jianshu.com/?' 
+ params + '&page={}'.format(i) 97 | html = get_page(url) 98 | for result in parse_page(html): 99 | save_to_mongo(result) 100 | 101 | 102 | if __name__ == '__main__': 103 | main() -------------------------------------------------------------------------------- /spiders/newhouse.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import csv 5 | 6 | def get_html_text(page): 7 | url = f'http://newhouse.nj.house365.com/house/dist-4_p-{page}/' 8 | print('正在解析:', url) 9 | try: 10 | r = requests.get(url) 11 | r.raise_for_status() 12 | return r.text 13 | except Exception as e: 14 | print('抓取失败', r.status_code) 15 | return None 16 | 17 | 18 | def get_page(html): 19 | soup = BeautifulSoup(html, 'html.parser') 20 | page_num = soup.find(class_='fr orderby').find('b').text 21 | return int(page_num) 22 | 23 | 24 | def get_info(html): 25 | soup = BeautifulSoup(html, 'html.parser') 26 | for mc in soup.find_all(class_='mc_list'): 27 | data = {} 28 | data['name'] = mc.find(class_='tit').find('a').text 29 | data['addr'] = mc.find(class_='yh_info').find_all('p')[1].text.strip().split()[0] 30 | data['price'] = mc.find(class_='xiang_price').text.strip().split()[0] 31 | try: 32 | data['phone'] = ''.join(mc.find(class_='pt5').find('b').text.split()) 33 | except Exception as e: 34 | print('售空,没给电话号码') 35 | data['phone'] = 'Null' 36 | print(data) 37 | save_to_file(data) 38 | 39 | def save_to_file(data): 40 | # with open('houseInfo.txt', 'a', encoding='utf-8') as f: 41 | # f.write(str(data) + '\n') 42 | with open('houseInfo.csv', 'a', encoding='utf-8', newline ='') as f: 43 | writer = csv.DictWriter(f, ['name', 'phone', 'addr', 'price']) 44 | writer.writerow(data) 45 | 46 | 47 | if __name__ == '__main__': 48 | html = get_html_text(1) 49 | page_num = int(get_page(html) / 15) + 2 50 | get_info(html) 51 | for i in range(2, page_num): 52 | html = get_html_text(str(i)) 53 | get_info(html) 54 | time.sleep(3) 55 | -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStockInfo.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStockInfo.txt -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/pipelines.cpython-36.pyc 
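# --- Added sketch (not in the repository): newhouse.py above appends rows with
# --- csv.DictWriter but never writes a header, so houseInfo.csv ends up without
# --- column names. A minimal fix is to emit the header once when the file is created:
import csv
import os

FIELDS = ['name', 'phone', 'addr', 'price']


def save_to_file(data, path='houseInfo.csv'):
    write_header = not os.path.exists(path)
    with open(path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, FIELDS)
        if write_header:
            writer.writeheader()  # column names written a single time
        writer.writerow(data)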
-------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaidustocksItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BaidustocksSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class BaidustocksDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 
67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 |
-------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/pipelines.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | 10 | class BaidustocksInfoPipeline(object): 11 | def open_spider(self, spider): 12 | # open the output file once when the spider starts; without this method 13 | # self.f is never defined and process_item silently drops every item 14 | self.f = open('BaiduStockInfo.txt', 'w')  # matches BaiduStockInfo.txt in the project root 15 | 16 | def close_spider(self, spider): 17 | self.f.close() 18 | 19 | def process_item(self, item, spider): 20 | try: 21 | line = str(dict(item)) + '\n' 22 | self.f.write(line) 23 | except: 24 | pass 25 | return item
-------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/settings.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for BaiduStocks project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'BaiduStocks' 13 | 14 | SPIDER_MODULES = ['BaiduStocks.spiders'] 15 | NEWSPIDER_MODULE = 'BaiduStocks.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'BaiduStocks (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'BaiduStocks.middlewares.BaidustocksSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'BaiduStocks.middlewares.BaidustocksDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/spiders/__pycache__/stocks.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/spiders/__pycache__/stocks.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/spiders/stocks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | 5 | 6 | class StocksSpider(scrapy.Spider): 7 | name = 'stocks' 8 | allowed_domains = ['baidu.com'] 9 | start_urls = ['http://quote.eastmoney.com/stocklist.html'] 10 | 11 | 12 | def parse(self, response): 13 | # temp = response.css('.a') 14 | # print(temp.css('::text').extract(), '***************') 15 | # print(response.css('a::attr(href)').extract(), '***************') 16 | # print('***************************************************') 17 | for href in response.css('a::attr(href)').extract(): 18 | # for href in temp.css('::attr(href)').extract(): 19 | # 找到class为a的所有结点。提取a标签属性为href的内容 20 | # .extract()为了提取真实的原文数据 返回的系统自带的List 没有这个是SelectorList 21 | # print(href) 22 | try: 23 | # re.findall(r'[s][hz]\d{6}', href)[0] 24 | stock = re.findall(r'[s][hz]\d{6}', href)[0] #以列表形式返回能匹配的字符串 25 | url = 'https://gupiao.baidu.com/stock/' + stock + '.html' 26 | # print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%') 27 | # print(url) 28 | yield scrapy.Request(url, callback=self.parse_stock) 29 | # Request类 由scrapy生产。由downloader执行 30 | # 表示一个HTTP请求。.method对应请求的方法 31 | # classback 回调函数 将此请求返回的response传递给下一个函数进行处理 32 | except: 33 | continue 34 | 35 | def parse_stock(self, response): 36 | info_dict = {} 37 | temp = response 38 | stock_info = response.css('.stock-bets') 39 | name = stock_info.css('.bets-name').extract()[0] 40 | key_list = stock_info.css('dt').extract() 41 | value_list = stock_info.css('dd').extract() 42 | for i in range(len(key_list)): 43 | key = re.findall(r'>.*', key_list[i])[0][1:-5] 44 | try: 45 | val = re.findall(r'\d+.?.*.', value_list[i])[0][0:-5] 46 | except: 47 | val = '--' 48 | info_dict[key] = val 49 | info_dict.update( 50 | {'股票名称': re.findall('\s.*\(', name)[0].split()[0] + re.findall('\>.*\<', name)[0][1: -1]}) 51 | yield info_dict 52 | -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/调试.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from scrapy import cmdline 4 | name = 'stocks' 5 | cmd = 'scrapy crawl {0}'.format(name) 6 | cmdline.execute(cmd.split()) 
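# --- Added sketch (not part of the original 调试.py): an alternative debug
# --- runner that starts the 'stocks' spider in-process with CrawlerProcess
# --- instead of re-invoking the scrapy command line; it assumes it is run from
# --- the BaiduStocks project directory so get_project_settings() finds settings.py.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('stocks')  # spider name defined in spiders/stocks.py
process.start()          # blocks until the crawl finishes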
-------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = BaiduStocks.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = BaiduStocks 12 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TencentItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | position_name = scrapy.Field() 15 | position_link = scrapy.Field() 16 | position_type = scrapy.Field() 17 | position_number = scrapy.Field() 18 | work_location = scrapy.Field() 19 | publish_time = scrapy.Field() 20 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TencentSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class TencentDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TencentPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Tencent project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Tencent' 13 | 14 | SPIDER_MODULES = ['Tencent.spiders'] 15 | NEWSPIDER_MODULE = 'Tencent.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Tencent (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'Tencent.middlewares.TencentSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'Tencent.middlewares.TencentDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'Tencent.pipelines.TencentPipeline': 300, 69 | #} 
70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/spiders/__pycache__/tencent.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/spiders/__pycache__/tencent.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/spiders/tencent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, Request 3 | 4 | from Tencent.items import TencentItem 5 | 6 | 7 | class TencentSpider(Spider): 8 | name = 'tencent' 9 | allowed_domains = ['tencent.com'] 10 | url = 'https://hr.tencent.com/position.php?&start={index}#a' 11 | 12 | def start_requests(self): 13 | start_url = self.url.format(index=0) 14 | yield Request(start_url, callback=self.parse) 15 | 16 | def parse(self, response): 17 | table_list = response.xpath('//*[@id="position"]/div[1]/table/tr') 18 | for tr in table_list[1:-1]: 19 | item = TencentItem() 20 | item['position_name'] = tr.xpath('./td/a/text()').extract_first() 21 | item['position_link'] = tr.xpath('./td/a/@href').extract_first() 22 | item['position_type'] = tr.xpath('./td[2]/text()').extract_first() 23 | item['position_number'] = tr.xpath('./td[3]/text()').extract_first() 24 | item['work_location'] = tr.xpath('./td[4]/text()').extract_first() 25 | item['publish_time'] = tr.xpath('./td[5]/text()').extract_first() 26 | yield item 27 | for i in range(2, 331): 28 | url = self.url.format(index=i*10) 29 | yield Request(url=url, callback=self.parse) 30 
| -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Tencent.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Tencent 12 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/jianshu/jishuspider/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JishuItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | author = scrapy.Field() 16 | publish_time = scrapy.Field() 17 | wordage = scrapy.Field() 18 | views_count = scrapy.Field() 19 | comments_count = scrapy.Field() 20 | likes_count = scrapy.Field() 21 | rewards_count = scrapy.Field() 22 | pass 23 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class JishuspiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class JishuspiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | 10 | class JishuspiderPipeline(object): 11 | def process_item(self, item, spider): 12 | return item 13 | 14 | 15 | class MongoPipeline(object): 16 | def __init__(self, mongo_uri, mongo_db): 17 | self.mongo_uri = mongo_uri 18 | self.mongo_db = mongo_db 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls( 23 | mongo_db=crawler.settings.get('MONGO_DB'), 24 | mongo_uri=crawler.settings.get('MONGO_URI') # keyword must match the mongo_uri parameter of __init__ 25 | ) 26 | 27 | def open_spider(self, spider): 28 | self.client = pymongo.MongoClient(self.mongo_uri) 29 | self.db = self.client[self.mongo_db] 30 | 31 | def process_item(self, item, spider): 32 | name = item.__class__.__name__ 33 | self.db[name].insert_one(dict(item)) 34 | return item 35 | 36 | def close_spider(self, spider): 37 | self.client.close() 38 | 39 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jishuspider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jishuspider' 13 | 14 | SPIDER_MODULES = ['jishuspider.spiders'] 15 | NEWSPIDER_MODULE = 'jishuspider.spiders' 16 | 17 | MONGO_URI = 'localhost' 18 | MONGO_DB = 'jianshu' 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'jishuspider (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | DEFAULT_REQUEST_HEADERS = { 44 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | 'Accept-Language': 'en', 46 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 47 | } 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'jishuspider.middlewares.JishuspiderSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'jishuspider.middlewares.JishuspiderDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 69 | # ITEM_PIPELINES = { 70 | # 'jishuspider.pipelines.MongoPipeline': 300, 71 | # } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 
-------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from jishuspider.items import JishuItem 4 | import re 5 | import json 6 | 7 | 8 | class ZhihuSpider(scrapy.Spider): 9 | name = 'jishu' 10 | 11 | def start_requests(self): 12 | self.url = 'https://www.jianshu.com/trending/weekly?&page=1' 13 | self.headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 15 | } 16 | yield scrapy.Request(self.url, headers=self.headers, callback=self.parse) 17 | 18 | def parse(self, response): 19 | lis = response.css('#list-container > ul li') 20 | for li in lis: 21 | href = li.css('.title::attr(href)').extract_first() 22 | url = response.urljoin(href) 23 | meta = {'Referer': url} 24 | yield scrapy.Request(url, meta=meta, headers=self.headers, callback=self.parse_page) 25 | 26 | def parse_page(self, response): 27 | page_data = re.search(r'page-data">(.*?)<', response.text, re.S).group(1) 28 | note = json.loads(page_data)['note'] 29 | note_id = note['id'] 30 | meta = { 31 | 'author': note['author'], # author info 32 | 'title': response.css('body > div.note > div.post > div.article > .title::text').extract_first(), # title 33 | 'wordage': note['public_wordage'], # total word count 34 | 'views_count': note['views_count'], # number of views 35 | 'comments_count': note['comments_count'], # number of comments 36 | 'likes_count': note['likes_count'], # number of likes 37 | 'rewards_count': note['total_rewards_count'] # number of rewards 38 | } 39 | comment_headers = { 40 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 41 | 'Referer': response.meta['Referer'], 42 | 'Accept': 'application/json', 43 | 'Accept-Encoding': 'gzip, deflate, sdch, br', 44 | 'Accept-Language': 'zh-CN,zh;q=0.8', 45 | 'Connection': 'keep-alive' 46 | } 47 | comments_url = 'https://www.jianshu.com/notes/%s/comments?comment_id=&author_only=false&since_id=0&max_id=1586510606000&order_by=desc&page=1' % str(note_id) 48 | yield scrapy.Request(url=comments_url, headers=comment_headers, meta=meta, callback=self.parse_comments) 49 | 50 | def parse_comments(self, response): 51 | # Fill the item with the article details carried over in the request meta and yield it. 52 | item = JishuItem() 53 | for field in ('author', 'title', 'wordage', 'views_count', 'comments_count', 'likes_count', 'rewards_count'): 54 | item[field] = response.meta.get(field) 55 | page = json.loads(response.text) 56 | comments = page['comments'] 57 | comment_infos = [] # commenter info and comment content for every comment 58 | for comment in comments: 59 | user = comment['user'] 60 | comment_infos.append({ 61 | 'nick_name': user['nickname'], # commenter's name 62 | 'compiled_comment': comment['compiled_content'] # comment content 63 | }) 64 | # comment['children'] holds the reply comments; they are not handled yet 65 | yield item 66 | 67 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more
information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jishuspider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jishuspider 12 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/movie/movie/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/movie/movie/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/movie/movie/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MovieItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MovieSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class MovieDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class MoviePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for movie project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'movie' 13 | 14 | SPIDER_MODULES = ['movie.spiders'] 15 | NEWSPIDER_MODULE = 'movie.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'movie (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'movie.middlewares.MovieSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'movie.middlewares.MovieDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'movie.pipelines.MoviePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy 
project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/movie/movie/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/spiders/__pycache__/movies.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/movie/movie/spiders/__pycache__/movies.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/spiders/movies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class MoviesSpider(scrapy.Spider): 6 | name = 'movies' 7 | allowed_domains = ['phthon.com'] 8 | start_urls = ['http://phthon.com/'] 9 | 10 | def parse(self, response): 11 | movies = response.x 12 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = movie.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = movie 12 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/python123demo/python123demo/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/python123demo/python123demo/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/python123demo/python123demo/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class Python123DemoItem(scrapy.Item): 12 | # define the fields for your item here like: 13 
| # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class Python123DemoSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class Python123DemoDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class Python123DemoPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for python123demo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'python123demo' 13 | 14 | SPIDER_MODULES = ['python123demo.spiders'] 15 | NEWSPIDER_MODULE = 'python123demo.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'python123demo (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'python123demo.middlewares.Python123DemoSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'python123demo.middlewares.Python123DemoDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 
67 | #ITEM_PIPELINES = { 68 | # 'python123demo.pipelines.Python123DemoPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/python123demo/python123demo/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/spiders/__pycache__/demo.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/python123demo/python123demo/spiders/__pycache__/demo.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/spiders/demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import os 4 | 5 | 6 | class DemoSpider(scrapy.Spider): 7 | name = "demo" 8 | allowed_domains = ['python123.io'] 9 | start_urls = ["http://python123.io/ws/demo.html"] 10 | 11 | def parse(self, response): 12 | """ 13 | Handle the response: parse the content into a dict and discover new URLs to crawl. 14 | :param response: 15 | """ 16 | fname = response.url.split('/')[-1] 17 | with open(fname, 'wb') as f: 18 | f.write(response.body) 19 | self.log('Saved file %s.'
% fname) 20 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = python123demo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = python123demo 12 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class QuotesItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | text = scrapy.Field() 15 | author = scrapy.Field() 16 | tags = scrapy.Field() 17 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: 
https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from scrapy.exceptions import DropItem 9 | 10 | 11 | class TextPipeline(object): 12 | 13 | def __init__(self): 14 | self.limit = 50 15 | 16 | def process_item(self, item, spider): 17 | if item['text']: 18 | if len(item['text']) > self.limit: 19 | item['text'] = item['text'][0:self.limit].rstrip() + '...' 20 | return item 21 | else: 22 | raise DropItem('Missing Text') 23 | 24 | 25 | class MongoPipeline(object): 26 | 27 | def __init__(self, mongo_url, mongo_db): 28 | self.mongo_url = mongo_url 29 | self.mongo_db = mongo_db 30 | 31 | @classmethod 32 | def from_crawler(cls, crawler): 33 | return cls( 34 | mongo_url=crawler.settings.get('MONGO_URL'), 35 | mongo_db=crawler.settings.get('MONGO_DB') 36 | ) 37 | 38 | # Connect to the database as soon as a spider is opened 39 | def open_spider(self, spider): 40 | self.client = pymongo.MongoClient(self.mongo_url) 41 | self.db = self.client[self.mongo_db] 42 | 43 | def process_item(self, item, spider): 44 | name = item.__class__.__name__ 45 | self.db['quotes'].insert_one(dict(item)) 46 | return item 47 | 48 | def close_spider(self, spider): 49 | self.client.close() 50 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for quoteturorial project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'quoteturorial' 13 | 14 | SPIDER_MODULES = ['quoteturorial.spiders'] 15 | NEWSPIDER_MODULE = 'quoteturorial.spiders' 16 | 17 | MONGO_URL = 'localhost' 18 | MONGO_DB = 'quotestutorial' 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | #USER_AGENT = 'quoteturorial (+http://www.yourdomain.com)' 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = True 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | #DOWNLOAD_DELAY = 3 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'quoteturorial.middlewares.QuoteturorialSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | #
'quoteturorial.middlewares.QuoteturorialDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'quoteturorial.pipelines.TextPipeline': 300, 71 | 'quoteturorial.pipelines.MongoPipeline': 400, 72 | } 73 | 74 | # Enable and configure the AutoThrottle extension (disabled by default) 75 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 76 | #AUTOTHROTTLE_ENABLED = True 77 | # The initial download delay 78 | #AUTOTHROTTLE_START_DELAY = 5 79 | # The maximum download delay to be set in case of high latencies 80 | #AUTOTHROTTLE_MAX_DELAY = 60 81 | # The average number of requests Scrapy should be sending in parallel to 82 | # each remote server 83 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 84 | # Enable showing throttling stats for every response received: 85 | #AUTOTHROTTLE_DEBUG = False 86 | 87 | # Enable and configure HTTP caching (disabled by default) 88 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 89 | #HTTPCACHE_ENABLED = True 90 | #HTTPCACHE_EXPIRATION_SECS = 0 91 | #HTTPCACHE_DIR = 'httpcache' 92 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 93 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 94 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/spiders/__pycache__/quotes.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/spiders/__pycache__/quotes.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/spiders/quotes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from quoteturorial.items import QuotesItem 4 | 5 | class QuotesSpider(scrapy.Spider): 6 | name = 'quotes' 7 | allowed_domains = ['quotes.toscrape.com'] 8 | start_urls = ['http://quotes.toscrape.com/'] 9 | 10 | def parse(self, response): 11 | item = QuotesItem() 12 | quotes = response.css('.quote') 13 | for quote in quotes: 14 | text = quote.css('.text::text').extract_first() 15 | author = quote.css('.author::text').extract_first() 16 | tags = quote.css('.tags .tag::text').extract() 17 | item['text'] = text 18 | item['author'] = author 19 | item['tags'] = tags 20 | yield item 21 | 22 | next = response.css('.pager .next a::attr(href)').extract_first() 23 | url = response.urljoin(next) 24 | yield scrapy.Request(url=url, callback=self.parse) 25 | 26 | 27 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/调试.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from scrapy import cmdline 4 | name = 'quotes' 5 | cmd = 'scrapy crawl {0}'.format(name) 6 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = quoteturorial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = quoteturorial 12 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = weibo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = weibo 12 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/weibo/weibo/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/weibo/weibo/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/weibo/weibo/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WeiboItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | 9 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class WeiboPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/weibo/weibo/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/spiders/__pycache__/weibos.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/weibo/weibo/spiders/__pycache__/weibos.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/spiders/weibos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, FormRequest 3 | 4 | 5 | class WeibosSpider(Spider): 6 | name = 'weibos' 7 | allowed_domains = ['weibo.cn'] 8 | start_urls = 'http://weibo.cn/search/mblog' 9 | max_page = 0 10 | 11 | def start_requests(self): 12 | keyword = '000001' 13 | url = '{url}?keyword={keyword}'.format(url=self.start_urls, keyword=keyword) 14 | print(url) 15 | for page in range(self.max_page + 1): 16 | data = { 17 | 'mp': str(self.max_page), 18 | 'page': str(page) 19 | } 20 | yield FormRequest(url, callback=self.parse_index, formdata=data) 21 | 22 | def parse_index(self, response): 23 | print(response.text) 24 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zhihuuser.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zhihuuser 12 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | 11 | class UserItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | allow_message = Field() 15 | answer_count = Field() 16 | articles_count = Field() 17 | avatar_url = Field() 18 | avatar_url_template = Field() 19 | badge = Field() 20 | employments = Field() 21 | follower_count = Field() 22 | gender = Field() 23 | headline = Field() 24 | id = Field() 25 | is_org = Field() 26 | name = Field() 27 | type = Field() 28 | url = Field() 29 | url_token = Field() 30 | user_type = Field() 31 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZhihuuserSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class ZhihuuserDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from scrapy.exceptions import DropItem 9 | 10 | 11 | class MongoPipeline(object): 12 | 13 | def __init__(self, mongo_url, mongo_db): 14 | self.mongo_url = mongo_url 15 | self.mongo_db = mongo_db 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | return cls( 20 | mongo_url=crawler.settings.get('MONGO_URL'), 21 | mongo_db=crawler.settings.get('MONGO_DB') 22 | ) 23 | 24 | # 打开一个爬虫的时候就链接数据库 25 | def open_spider(self, spider): 26 | self.client = pymongo.MongoClient(self.mongo_url) 27 | self.db = self.client[self.mongo_db] 28 | 29 | def process_item(self, item, spider): 30 | name = item.__class__.__name__ 31 | self.db['quotes'].insert(dict(item)) 32 | return item 33 | 34 | def close_spider(self, spider): 35 | self.client.close() 36 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
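# A note on the MongoPipeline defined in pipelines.py above: it only runs when it is
# enabled in the project settings, and it reads MONGO_URL / MONGO_DB from them via
# crawler.settings.get(). A minimal sketch of the matching settings.py entries follows;
# the concrete values are assumptions for illustration, not taken from this repo:
ITEM_PIPELINES = {'zhihuuser.pipelines.MongoPipeline': 300}
MONGO_URL = 'localhost'
MONGO_DB = 'zhihuuser'
# Two further points on process_item(): Collection.insert() is deprecated in pymongo 3
# (and removed in pymongo 4), so insert_one(dict(item)) is the usual replacement, and the
# collection name could reuse the already computed item.__class__.__name__ instead of the
# hard-coded 'quotes':
#     self.db[item.__class__.__name__].insert_one(dict(item))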
5 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, Request 3 | import json 4 | from zhihuuser.items import UserItem 5 | # from scrapy_redis 6 | 7 | class ZhihuSpider(Spider): 8 | name = 'zhihu' 9 | start_urls = ['http://www.zhihui.com/'] 10 | 11 | start_user = 'excited-vczh' 12 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}' 13 | user_query = 'allow_message%2Cis_followed%2Cis_following%2Cis_org%2Cis_blocking%2Cemployments%2Canswer_count%2Cfollower_count%2Carticles_count%2Cgender%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics' 14 | 15 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 16 | follows_query = 'data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics' 17 | 18 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}' 19 | followers_query = 'data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics' 20 | 21 | def start_requests(self): 22 | # 他关注的人的链接 23 | yield Request(self.user_url.format(user=self.start_user, include=self.user_query), callback=self.parse_user) 24 | yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, offset=0, limit=20), callback=self.parse_follows) 25 | yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, offset=0, limit=20), callback=self.parse_followers) 26 | 27 | def parse_user(self, response): 28 | result = json.loads(response.text) 29 | item = UserItem() 30 | for field in item.fields: 31 | if field in result.keys(): 32 | item[field] = result.get(field) 33 | yield item 34 | # 找到每个用户自己的关注列表 35 | yield Request(self.follows_url.format(user=result.get('url_token'), include=self.follows_query, limit=20, offset=0), callback=self.parse_follows) 36 | yield Request(self.followers_url.format(user=result.get('url_token'), include=self.followers_query, limit=20, offset=0), callback=self.parse_followers) 37 | 38 | def parse_follows(self, response): 39 | results = json.loads(response.text) 40 | if 'data' in results.keys(): 41 | for result in results.get('data'): 42 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), callback=self.parse_user) 43 | 44 | if 'paging' in results.keys() and 
results.get('paging').get('is_end') == False: 45 | next_page = results.get('paging').get('next') 46 | yield Request(next_page, callback=self.parse_follows) 47 | 48 | def parse_followers(self, response): 49 | results = json.loads(response.text) 50 | if 'data' in results.keys(): 51 | for result in results.get('data'): 52 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), callback=self.parse_user) 53 | 54 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 55 | next_page = results.get('paging').get('next') 56 | yield Request(next_page, callback=self.parse_followers) 57 | 58 | -------------------------------------------------------------------------------- /spiders/secondSpider/Data_Output.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import time 3 | 4 | 5 | class Data_Output(object): 6 | def __init__(self): 7 | self.filepath = 'baike_%s.html' % (time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())) 8 | self.output_head(self.filepath) 9 | self.datas = [] 10 | 11 | def store_data(self, data): 12 | if data is None: 13 | return 14 | # print('************') 15 | self.datas.append(data) 16 | # print(len(self.datas)) 17 | # if len(self.datas) > 10: 18 | self.output_html(self.filepath) 19 | 20 | def output_head(self, path): 21 | """ 22 | Write the HTML header 23 | :param path: 24 | :return: 25 | """ 26 | # the open() function can only write strings 27 | fout = codecs.open(path, 'a', encoding='utf-8') 28 | fout.write("<html>") 29 | fout.write("<body>") 30 | fout.write("<table>") 31 | fout.close() 32 | 33 | def output_html(self, path): 34 | """ 35 | Write the data into the HTML file 36 | :param path: file path 37 | :return: 38 | """ 39 | fout = codecs.open(path, 'a', encoding='utf-8') 40 | print(self.datas) 41 | for data in self.datas[:]: 42 | fout.write("<tr>") 43 | fout.write("<td>%s</td>" % data['url']) 44 | fout.write("<td>%s</td>" % data['title']) 45 | fout.write("<td>%s</td>" % data['summary']) 46 | fout.write("</tr>") 47 | self.datas.remove(data) 48 | fout.close() 49 | 50 | def ouput_end(self, path): 51 | """ 52 | Write the end of the HTML output 53 | :param path: output file path 54 | :return: 55 | """ 56 | fout = codecs.open(path, 'a', encoding='utf-8') 57 | fout.write("</table>
") 58 | fout.write("") 59 | fout.write("") 60 | fout.close() -------------------------------------------------------------------------------- /spiders/secondSpider/Html_Downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | class Html_Downloader(object): 5 | 6 | def download(self, url): 7 | if url is None: 8 | return None 9 | user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/51.0.2704.63 Safari/537.36' 10 | headers = {'User-Agent': user_agent} 11 | r = requests.get(url, headers=headers) 12 | if r.status_code == 200: 13 | r.encoding = 'utf-8' 14 | return r.text 15 | return None 16 | -------------------------------------------------------------------------------- /spiders/secondSpider/Html_Parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | from urllib import parse 4 | import urllib 5 | 6 | 7 | class Html_Parser(object): 8 | 9 | def parser(self, page_url, html_cont): 10 | """ 11 | 用于解析网页内容,抽取URL 和数据 12 | :param page_url: 下载页面的URL 13 | :param html_cont: 下载的网页内容 14 | :return: 返回URL和数据 15 | """ 16 | if page_url is None or html_cont is None: 17 | return 18 | soup = BeautifulSoup(html_cont, 'lxml') 19 | new_urls = self._get_new_urls(page_url, soup) 20 | new_data = self._get_new_data(page_url, soup) 21 | return new_urls, new_data 22 | 23 | def _get_new_urls(self, page_url, soup): 24 | """ 25 | 抽取新的URl集合 26 | :param page_url: 下载页面的URL 27 | :param soup: soup 28 | :return: 返回新的URL集合 29 | """ 30 | new_urls = set() 31 | # 抽取符合要求的a标记 32 | links = soup.find_all('a', href=re.compile(r'/item/.*')) 33 | for link in links: 34 | # 提取href属性 35 | new_url = link['href'] 36 | # 拼接成完成网址 37 | new_full_url = parse.urljoin(page_url, new_url) 38 | new_urls.add(new_full_url) 39 | return new_urls 40 | 41 | def _get_new_data(self, page_url, soup): 42 | """ 43 | 抽取有效数据 44 | :param page_url:下载页面的URL 45 | :param soup: 46 | :return: 返回有效数据 47 | """ 48 | data = {} 49 | data['url'] = page_url 50 | title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1') 51 | data['title'] = title.text 52 | summary = soup.find('div', class_='lemma-summary') 53 | data['summary'] = summary.text 54 | 55 | return data 56 | 57 | -------------------------------------------------------------------------------- /spiders/secondSpider/SpiderWork.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.managers import BaseManager 2 | from Html_Downloader import Html_Downloader 3 | from Html_Parser import Html_Parser 4 | 5 | 6 | class SpiderWork(object): 7 | def __init__(self): 8 | # 初始化分布式进程工作节点的连接作业 9 | # 实现第一步:使用BaseManager注册用于获取Queue的方法名称 10 | BaseManager.register('get_task_queue') 11 | BaseManager.register('get_result_queue') 12 | # 实现第二步,连接到服务器 13 | server_addr = '192.168.43.149' 14 | print(f'connect to server {server_addr}...') 15 | self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike') 16 | # 从网络连接 17 | self.m.connect() 18 | # 实现第三步:获取Queue的对象 19 | self.task = self.m.get_task_queue() 20 | self.result = self.m.get_result_queue() 21 | # 初始化网页下载器和解析器 22 | self.downloader = Html_Downloader() 23 | self.parser = Html_Parser() 24 | print('init finish') 25 | 26 | def crawl(self): 27 | while 1: 28 | try: 29 | if not self.task.empty(): 30 | url = self.task.get() 31 | if url == 'end': 32 | print('控制节点通知爬虫节点停止工作...') 33 | # 接的通知其他节点停止工作 34 | self.result.put({'new_urls': 
'end', 'data': 'end'}) 35 | return 36 | print('爬虫节点正在解析%s' % url.encode('utf-8')) 37 | content = self.downloader.download(url) 38 | new_urls, data = self.parser.parser(url, content) 39 | # print(new_urls) 40 | self.result.put({'new_urls': new_urls, 'data': data}) 41 | except EOFError as e: 42 | print(e, '连接工作节点失败') 43 | except Exception as e: 44 | print(e) 45 | print('Crawl fail') 46 | 47 | if __name__ == '__main__': 48 | spider = SpiderWork() 49 | spider.crawl() -------------------------------------------------------------------------------- /spiders/secondSpider/URL_Manager.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import hashlib 3 | 4 | 5 | class URL_Manager(object): 6 | def __init__(self): 7 | self.new_urls = self.load_progress('new_urls.txt') # 未爬取URL集合 8 | self.old_urls = self.load_progress('old_urls.txt') # 已爬取URL集合 9 | 10 | def has_new_url(self): 11 | """ 12 | 判断是否有未爬取的URL 13 | """ 14 | return self.new_url_size() != 0 15 | 16 | def get_new_url(self): 17 | """ 18 | 获取一个未爬取的URL 19 | """ 20 | new_url = self.new_urls.pop() 21 | # 摘要算法md() 22 | m = hashlib.md5() 23 | m.update(new_url.encode('utf-8')) 24 | self.old_urls.add(m.hexdigest()[8: -8]) 25 | return new_url 26 | 27 | def add_new_url(self, url): 28 | """ 29 | 将新的URL添加到未爬取的URl集合中 30 | :param url: 单个RUL 31 | :return: 32 | """ 33 | if url is None: 34 | return 35 | m = hashlib.md5() 36 | m.update(url.encode('utf-8')) 37 | url_md5 = m.hexdigest()[8: -8] 38 | if url not in self.new_urls and url_md5 not in self.old_urls: 39 | self.new_urls.add(url) 40 | 41 | def add_new_urls(self, urls): 42 | """ 43 | 将新的URL添加到未爬取的URL集合中 44 | :param urls: url集合 45 | :return: 46 | """ 47 | if urls is None or len(urls) == 0: 48 | return 49 | for url in urls: 50 | self.add_new_url(url) 51 | 52 | 53 | def new_url_size(self): 54 | """ 55 | 获取未爬取URL集合的大小 56 | :return: 57 | """ 58 | return len(self.new_urls) 59 | 60 | def old_url_size(self): 61 | """ 62 | 获取已经爬取URL集合的大小 63 | :return: 64 | """ 65 | return len(self.old_urls) 66 | 67 | def save_process(self, path, data): 68 | """ 69 | 保存进度 70 | :param path:文件路径 71 | :param data: 数据 72 | :return: 73 | """ 74 | with open(path, 'wb') as f: 75 | pickle.dump(data, f) 76 | 77 | def load_progress(self, path): 78 | """ 79 | 从本地文件加载进度 80 | :param path: 文件路径 81 | :return: 返回set集合 82 | """ 83 | print(f'[+]从文件加载进度:{path}') 84 | try: 85 | with open(path, 'rb') as f: 86 | tmp = pickle.load(f) 87 | return tmp 88 | except: 89 | print(f'[!]无进度文件,创建:{path}') 90 | return set() -------------------------------------------------------------------------------- /spiders/secondSpider/__pycache__/Data_Output.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/__pycache__/Data_Output.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/secondSpider/__pycache__/Html_Downloader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/__pycache__/Html_Downloader.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/secondSpider/__pycache__/Html_Parser.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/__pycache__/Html_Parser.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/secondSpider/__pycache__/URL_Manager.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/__pycache__/URL_Manager.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/secondSpider/new_urls.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/new_urls.txt -------------------------------------------------------------------------------- /spiders/secondSpider/old_urls.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/old_urls.txt -------------------------------------------------------------------------------- /spiders/selenium/Frame.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/selenium/Frame.py -------------------------------------------------------------------------------- /spiders/selenium/Jiaohu.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time 3 | 4 | browser = webdriver.Chrome() 5 | browser.get('http://www.taobao.com') 6 | input =browser.find_element_by_id('q') 7 | input.send_keys('IPhone') 8 | time.sleep(1) 9 | input.clear() 10 | input.send_keys('Ipad') 11 | button = browser.find_element_by_class_name('btn-search') 12 | button.click() -------------------------------------------------------------------------------- /spiders/selenium/javaScript.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | browser = webdriver.Chrome() 4 | browser.get('https://www.zhihu.com/explore') 5 | browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') 6 | browser.execute_script('alert("To Bottom")') -------------------------------------------------------------------------------- /spiders/selenium/前进后退.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | browser = webdriver.Chrome() 4 | browser.get('http://www.baidu.com/') 5 | input = browser.find_element_by_id('kw') 6 | input.send_keys('图片') 7 | botton = browser.find_element_by_id('su') 8 | botton.click() -------------------------------------------------------------------------------- /spiders/selenium/获取属性.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver import ActionChains 3 | 4 | browser = webdriver.Chrome() 5 | url = 'https://www.zhihu.com/explore' 6 | 7 | browser.get(url) 8 | # logo = browser.find_element_by_id('zh-top-link-logo') 9 | # print(logo) 10 | # print(logo.get_attribute('class')) 11 | input = browser.find_element_by_class_name('zu-top-link-logo') 12 | print(input.text) -------------------------------------------------------------------------------- 
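One gap worth noting in the selenium scripts above: 前进后退.py is named for history navigation, but it only performs a Baidu search and never calls the back/forward methods. A minimal sketch of that missing step, assuming the same local Chrome driver setup used by the other scripts:

import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('http://www.baidu.com/')
browser.get('https://www.taobao.com/')
browser.back()      # history: back to Baidu
time.sleep(1)
browser.forward()   # history: forward to Taobao again
browser.close()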
/spiders/selenium/选项卡管理.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | 4 | 5 | browser = webdriver.Chrome() 6 | browser.get('http://www.baidu.com') 7 | browser.execute_script('window.open()') 8 | browser.switch_to_window(browser.window_handles[1]) 9 | browser.get('https://www.taobao.com') 10 | time.sleep(1) 11 | browser.switch_to_window(browser.window_handles[0]) 12 | browser.get('https://python.org') -------------------------------------------------------------------------------- /spiders/selenium个人邮箱.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support.ui import WebDriverWait 4 | from selenium.webdriver.support import expected_conditions as EC 5 | 6 | browser = webdriver.Chrome() 7 | wait = WebDriverWait(browser, 10) 8 | 9 | 10 | def search(): 11 | browser.get('https://www.baidu.com/') 12 | input = wait.until(EC.presence_of_element_located((By.ID, 'kw'))) 13 | submit = wait.until(EC.element_to_be_clickable((By.ID, 'su'))) 14 | input.send_keys('qq邮箱') 15 | submit.click() 16 | wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="1"]/h3/a[1]'))).click() 17 | browser.implicitly_wait(10) 18 | browser.switch_to_window(browser.window_handles[1]) 19 | browser.switch_to_frame('login_frame') 20 | wait.until(EC.presence_of_element_located((By.ID, 'switcher_plogin'))).click() 21 | u = wait.until(EC.presence_of_element_located((By.ID, 'u'))) 22 | p = wait.until(EC.presence_of_element_located((By.ID, 'p'))) 23 | u.send_keys('942203701') 24 | p.send_keys('xingfu1314...') 25 | wait.until(EC.element_to_be_clickable((By.ID, 'login_button'))).click() 26 | 27 | 28 | def main(): 29 | search() 30 | 31 | 32 | if __name__ == '__main__': 33 | main() -------------------------------------------------------------------------------- /spiders/selenium模拟淘宝.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import re 3 | from bs4 import BeautifulSoup 4 | from selenium.common.exceptions import TimeoutException 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from config import * 9 | import pymongo 10 | # from selenium.webdriver.chrome.options import Options 11 | 12 | client = pymongo.MongoClient(MONGO_URL) 13 | db = client[MONGO_DB] 14 | # chrome_options = Options() 15 | # chrome_options.add_argument('--headless') 16 | # chrome_options.add_argument('--disable-gpu') 17 | browser = webdriver.Chrome(chrome_options=chrome_options) 18 | # browser = webdriver.PhantomJS(service_args=SERVICE_ARGS) 19 | wait = WebDriverWait(browser, 10) 20 | 21 | # browser.set_window_size(1400, 900) 22 | 23 | 24 | def search(): 25 | try: 26 | browser.get('https://www.taobao.com') 27 | input = wait.until(EC.presence_of_element_located((By.ID, 'q'))) 28 | submit = wait.until( 29 | EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button'))) 30 | input.send_keys('美食') 31 | submit.click() 32 | total = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'total'))) 33 | get_products() 34 | return total.text 35 | except TimeoutException: 36 | return search() 37 | 38 | 39 | def next_page(page_number): 40 | try: 41 | submit = 
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.next > a > span:nth-child(1)'))) 42 | submit.click() 43 | num = wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),str(page_number))) 44 | print(f'正在解析第{page_number}页') 45 | get_products() 46 | except TimeoutException: 47 | next_page(page_number) 48 | 49 | 50 | def get_products(): 51 | product = {} 52 | wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item'))) 53 | html = browser.page_source 54 | soup = BeautifulSoup(html, 'lxml') 55 | items = soup.select('#mainsrp-itemlist .items .item') 56 | # item = soup.find_all(class_='item') 57 | for item in items: 58 | product = { 59 | 'image': item.select('.pic .img')[0]['data-src'], 60 | 'price': item.find(class_='price').text.strip(), 61 | 'deal': item.find(class_='deal-cnt').text[: -3], 62 | 'title': item.find(class_='title').text.strip(), 63 | 'shop': item.find(class_='shop').text.strip(), 64 | 'location': item.find(class_='location').text 65 | } 66 | save_to_mongo(product) 67 | 68 | 69 | def save_to_mongo(result): 70 | try: 71 | if db[MONGO_DB].insert(result): 72 | print('存储到MONGODB成功') 73 | except Exception: 74 | print('存储失败') 75 | 76 | 77 | def main(): 78 | total = search() 79 | total = int(re.search('(\d+)', total).group(1)) 80 | for i in range(2, total+1): 81 | next_page(i) 82 | 83 | 84 | if __name__ == '__main__': 85 | main() -------------------------------------------------------------------------------- /spiders/中国大学排名定向爬虫.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | import requests 3 | import urllib.error 4 | from bs4 import BeautifulSoup 5 | import os 6 | 7 | 8 | def get_html_text(url): 9 | try: 10 | r = requests.get(url, timeout=30) 11 | r.raise_for_status() 12 | r.encoding = r.apparent_encoding 13 | return r.text 14 | except urllib.error.URLError as e: 15 | print(e.reason) 16 | return "" 17 | 18 | 19 | def fill_univ_list(ulist, html): 20 | soup = BeautifulSoup(html, 'lxml') # html/xml这两种格式 21 | # print(soup.prettify()) 22 | # print(soup.find('tbody')) 23 | for tr in soup.find("tbody").children: 24 | if isinstance(tr, bs4.element.Tag): # 过滤掉非标签类型 25 | tds = tr('td') 26 | # print(tds) 27 | ulist.append([tds[0].string, tds[1].string, tds[2].string]) 28 | 29 | # all_u = soup.find("tbody").children 30 | # print(all_u) 31 | # for tr in all_u: 32 | # tds = tr.find_all('td') 33 | # ulist.append([tds[0], tds[1], tds[2]]) 34 | 35 | 36 | def print_univ_list(ulist, num): 37 | tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}" 38 | print(tplt.format("排名", "学校名称", "省市", chr(12288))) 39 | for i in range(num): 40 | u = ulist[i] 41 | print(tplt.format(u[0], u[1], u[2], chr(12288))) 42 | 43 | if __name__ == '__main__': 44 | uinfo = [] 45 | url = "http://www.zuihaodaxue.cn/shengyuanzhiliangpaiming2018.html" 46 | html = get_html_text(url) 47 | fill_univ_list(uinfo, html) 48 | print_univ_list(uinfo, 20) -------------------------------------------------------------------------------- /spiders/分布式爬虫/Data_Output.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | 4 | class Data_Output(object): 5 | def __init__(self): 6 | self.datas = [] 7 | 8 | def store_data(self, data): 9 | if data is None: 10 | return 11 | self.datas.append(data) 12 | 13 | def output_html(self): 14 | fout = codecs.open('baike.html', 'w', encoding='gbk') 15 | 
fout.write("") 16 | fout.write("") 17 | fout.write("") 18 | for data in self.datas: 19 | fout.write("") 20 | fout.write("" % data['url']) 21 | fout.write("" % data['title']) 22 | fout.write("" % data['summary']) 23 | fout.write("") 24 | fout.write("") 25 | fout.write("") 26 | fout.write("
%s%s%s
") 27 | fout.close() -------------------------------------------------------------------------------- /spiders/分布式爬虫/Html_Downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | class Html_Downloader(object): 5 | 6 | def download(self, url): 7 | if url is None: 8 | return None 9 | user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/51.0.2704.63 Safari/537.36' 10 | headers = {'User-Agent': user_agent} 11 | r = requests.get(url, headers=headers) 12 | if r.status_code == 200: 13 | r.encoding = 'Utf-8' 14 | return r.text 15 | return None 16 | -------------------------------------------------------------------------------- /spiders/分布式爬虫/Html_Parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | from urllib import parse 4 | import urllib 5 | 6 | 7 | class Html_Parser(object): 8 | 9 | def parser(self, page_url, html_cont): 10 | """ 11 | 用于解析网页内容,抽取URL 和数据 12 | :param page_url: 下载页面的URL 13 | :param html_cont: 下载的网页内容 14 | :return: 返回URL和数据 15 | """ 16 | if page_url is None or html_cont is None: 17 | return 18 | soup = BeautifulSoup(html_cont, 'lxml') 19 | new_urls = self._get_new_urls(page_url, soup) 20 | new_data = self._get_new_data(page_url, soup) 21 | # print(new_urls, new_data) 22 | return new_urls, new_data 23 | 24 | def _get_new_urls(self, page_url, soup): 25 | """ 26 | 抽取新的URl集合 27 | :param page_url: 下载页面的URL 28 | :param soup: soup 29 | :return: 返回新的URL集合 30 | """ 31 | new_urls = set() 32 | # 抽取符合要求的a标记 33 | links = soup.find_all('a', href=re.compile(r'/view/\d+\.htm')) 34 | # print('****************') 35 | # print(links) 36 | # print('***************') 37 | for link in links: 38 | # 提取href属性 39 | new_url = link['href'] 40 | # 拼接成完成网址 41 | new_full_url = parse.urljoin(page_url, new_url) 42 | new_urls.add(new_full_url) 43 | return new_urls 44 | 45 | 46 | def _get_new_data(self, page_url, soup): 47 | """ 48 | 抽取有效数据 49 | :param page_url:下载页面的URL 50 | :param soup: 51 | :return: 返回有效数据 52 | """ 53 | data = {} 54 | data['url'] = page_url 55 | title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1') 56 | # print(title.get_text) 57 | data['title'] = title.text 58 | summary = soup.find('div', class_='lemma-summary') 59 | # print(summary.text) 60 | # 获取tag中包含的所有文本内容,包括子孙tag中的内容,并将结果作为Unico字符串返回 61 | data['summary'] = summary.text 62 | 63 | return data 64 | 65 | -------------------------------------------------------------------------------- /spiders/分布式爬虫/SpiderMan.py: -------------------------------------------------------------------------------- 1 | from Data_Output import Data_Output 2 | from Html_Downloader import Html_Downloader 3 | from Html_Parser import Html_Parser 4 | from URL_Manager import URL_Manager 5 | 6 | 7 | class SpiderMan(object): 8 | def __init__(self): 9 | self.mamager = URL_Manager() 10 | self.downloader = Html_Downloader() 11 | self.parser = Html_Parser() 12 | self.output = Data_Output() 13 | 14 | def crawl(self, root_url): 15 | # 添加入口URL 16 | self.mamager.add_new_url(root_url) 17 | # 判断url管理器中是否有新的url, 同时判断抓取了多少个url 18 | while (self.mamager.has_new_url() and self.mamager.old_url_size()<100): 19 | try: 20 | # 从URL管理器获取新的url 21 | new_url = self.mamager.get_new_url() 22 | html = self.downloader.download(new_url) 23 | # 从html解析器抽取网页数据 24 | new_url, data = self.parser.parser(new_url, html) 25 | # 将抽取的url 添加到URL管理器中 26 | # print(new_url, data) 27 | 
self.mamager.add_new_urls(new_url) 28 | # 数据存储器存储文件 29 | # print('***************8') 30 | self.output.store_data(data) 31 | 32 | print(f"已经抓取{self.mamager.old_url_size()}个链接") 33 | except Exception as e: 34 | print(e) 35 | self.output.output_html() 36 | 37 | if __name__ == '__main__': 38 | spider_man = SpiderMan() 39 | spider_man.crawl("http://baike.baidu.com/view/284853.htm") 40 | -------------------------------------------------------------------------------- /spiders/分布式爬虫/URL_Manager.py: -------------------------------------------------------------------------------- 1 | class URL_Manager(object): 2 | def __init__(self): 3 | self.new_urls = set() # 未爬取URL集合 4 | self.old_urls = set() # 已爬取URL集合 5 | 6 | def has_new_url(self): 7 | """ 8 | 判断是否有未爬取的URL 9 | """ 10 | return self.new_url_size() != 0 11 | 12 | def get_new_url(self): 13 | """ 14 | 获取一个未爬取的URL 15 | """ 16 | new_url = self.new_urls.pop() 17 | self.old_urls.add(new_url) 18 | return new_url 19 | 20 | def add_new_url(self, url): 21 | """ 22 | 将新的URL添加到未爬取的URl集合中 23 | :param url: 单个RUL 24 | :return: 25 | """ 26 | if url is None: 27 | return 28 | if url not in self.new_urls and url not in self.old_urls: 29 | # print(self.new_urls, self.old_urls) 30 | self.new_urls.add(url) 31 | # print(self.new_urls) 32 | # print('*****') 33 | 34 | def add_new_urls(self, urls): 35 | """ 36 | 将新的URL添加到未爬取的URL集合中 37 | :param urls: url集合 38 | :return: 39 | """ 40 | if urls is None or len(urls) == 0: 41 | return 42 | for url in urls: 43 | self.add_new_url(url) 44 | 45 | 46 | def new_url_size(self): 47 | """ 48 | 获取未爬取URL集合的大小 49 | :return: 50 | """ 51 | return len(self.new_urls) 52 | 53 | def old_url_size(self): 54 | """ 55 | 获取已经爬取URL集合的大小 56 | :return: 57 | """ 58 | return len(self.old_urls) 59 | -------------------------------------------------------------------------------- /spiders/分布式爬虫/__pycache__/Data_Output.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/分布式爬虫/__pycache__/Data_Output.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/分布式爬虫/__pycache__/Html_Downloader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/分布式爬虫/__pycache__/Html_Downloader.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/分布式爬虫/__pycache__/Html_Parser.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/分布式爬虫/__pycache__/Html_Parser.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/分布式爬虫/__pycache__/URL_Manager.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/分布式爬虫/__pycache__/URL_Manager.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/分布式进程/taskManager.py: -------------------------------------------------------------------------------- 1 | import queue 2 | import socket 3 | from socket import SOL_SOCKET, SO_REUSEADDR 4 | from multiprocessing.managers import BaseManager 5 | from multiprocessing import 
freeze_support 6 | # 任务个数 7 | task_number = 10 8 | # 定义收发队列 9 | task_queue = queue.Queue(task_number) 10 | result_queue = queue.Queue(task_number) 11 | 12 | 13 | def get_task(): 14 | return task_queue 15 | 16 | 17 | def get_result(): 18 | return result_queue 19 | 20 | 21 | # 创建类似的queueManager: 22 | # 从BaseManager继承的 23 | class QueueManager(BaseManager): 24 | pass 25 | 26 | 27 | def win_run(): 28 | # windows 下绑定调用接口不能使用lambda,所以只能先定义函数再绑定 29 | # 把两个队列注册到网络上 30 | sk = socket.socket() 31 | QueueManager.register('get_task_queue', callable=get_task) 32 | QueueManager.register('get_result_queue', callable=get_result) 33 | # 绑定端口并设置验证口令,Windows下需要填写IP地址,Linux 下不填默认本机地址 34 | sk.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1) 35 | manager = QueueManager(address=('192.168.43.149', 8001), authkey=b'qiye') 36 | # 启动 37 | manager.start() 38 | try: 39 | # 通过网络获取任务队列和结果队列 40 | task = manager.get_task_queue() 41 | result = manager.get_result_queue() 42 | # 添加任务 43 | for url in ["ImageUrl_"+str(i) for i in range(10)]: 44 | print(f'put task {url}...') 45 | task.put(url) 46 | print('try get result...') 47 | for i in range(10): 48 | print(f'result is {result.get(timeout=100)}') 49 | except: 50 | print('Manager error') 51 | finally: 52 | # 一定要关闭,否则会报管道未关闭的错误 53 | manager.shutdown() 54 | 55 | 56 | if __name__ == '__main__': 57 | # Windos 下多进程可能会有问题,添加这句可以缓解 58 | freeze_support() 59 | win_run() -------------------------------------------------------------------------------- /spiders/分布式进程/taskWork.py: -------------------------------------------------------------------------------- 1 | import time 2 | from multiprocessing.managers import BaseManager 3 | 4 | 5 | class QueueManager(BaseManager): 6 | pass 7 | # 第一步,使用QueueManager 注册用于获取Queen的方法名称 8 | QueueManager.register('get_task_queue') 9 | QueueManager.register('get_result_queue') 10 | # 第二步,链接服务器 11 | server_addr = '192.168.43.149' 12 | print(f'Connect to server {server_addr}') 13 | # 端口和验证口令注意保持与服务进程完全一致: 14 | m = QueueManager(address=(server_addr, 8001), authkey=b'qiye') 15 | # 从网络链接: 16 | m.connect() 17 | # 第三步:获取queue的对象 18 | task = m.get_task_queue() 19 | result = m.get_result_queue() 20 | # 第四步,从task队列中获取任务,并把结果写入result队列: 21 | while (not task.empty()): 22 | immage_url = task.get(True, timeout=20) 23 | print(f'run task download {immage_url}...') 24 | time.sleep(1) 25 | result.put(f'{immage_url}--->success') 26 | 27 | # 处理结束: 28 | print('worker exit.') 29 | 30 | -------------------------------------------------------------------------------- /spiders/医生信息索取.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | import requests 3 | import urllib.error 4 | from bs4 import BeautifulSoup 5 | import os 6 | import random 7 | import re 8 | 9 | user_agents = [ 10 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', 11 | 'Opera/9.25 (Windows NT 5.1; U; en)', 12 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 13 | 'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)', 14 | 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12', 15 | 'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9', 16 | "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7", 17 | "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ", 18 | "Mozilla/5.0 (Windows NT 10.0; 
WOW64)", 19 | 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/51.0.2704.63 Safari/537.36' 20 | ] 21 | 22 | 23 | def get_html_text(url, page): 24 | try: 25 | headers = {'User-Agent': user_agents[-1], 26 | 'Host': '400.haodf.com' 27 | } 28 | kv = {'nowpage': page} 29 | r = requests.get(url, headers=headers, params=kv) 30 | r.raise_for_status() 31 | r.encoding = r.apparent_encoding 32 | return r.text 33 | except Exception as e: 34 | print(e) 35 | return "" 36 | 37 | 38 | def parse_doc_info(ulis, html): 39 | soup = BeautifulSoup(html, 'lxml') 40 | all_info = soup.find_all(class_='clearfix showResult-cell bb pb10 mt15') 41 | for p in all_info: 42 | try: 43 | tc_p = p.find('p', class_='tc mt5') 44 | name = tc_p.find('a').text 45 | grade = re.findall(r'.*.*?', html, flags=re.S) 23 | for dd in dds: 24 | try: 25 | title = re.findall(r'title="(.+?)"', dd)[0], 26 | staring = re.findall(r'star">(.*?)

</p>', dd, flags=re.S)[0].split()[0], 27 | # print(staring) 28 | releasetime = re.findall(r'releasetime">(.*?)</p>
', dd)[0], 29 | yield { 30 | 'title': title, 31 | 'staring': staring, 32 | 'releasetime': releasetime 33 | } 34 | except: 35 | continue 36 | 37 | 38 | def output_info(content): 39 | # mutex = Lock() 40 | # mutex.acquire() 41 | with open('test.txt', 'a', encoding='utf-8') as f: 42 | f.write(json.dumps(content, ensure_ascii=False) + '\n', ) 43 | f.close() 44 | # mutex.release() 45 | 46 | 47 | def main(offset): 48 | info = [] 49 | url = 'http://maoyan.com/board/4?offset=' + str(offset) 50 | html = get_htme_text(url) 51 | for item in parse_html(html): 52 | print(item) 53 | output_info(item) 54 | 55 | 56 | if __name__ == '__main__': 57 | for i in range(10): 58 | main(i*10) 59 | # pool = Pool() 60 | # # for i in range(10): 61 | # # pool.apply_async(main, args=(i*10,)) 62 | # pool.map(main, [i * 10 for i in range(10)]) 63 | # pool.close() 64 | # pool.join() 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /spiders/百度图片.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | import urllib.error 4 | from bs4 import BeautifulSoup 5 | 6 | 7 | class Bai_du: 8 | def __init__(self): 9 | self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36' \ 10 | ' (KHTML, like Gecko)Chrome/51.0.2704.63 Safari/537.36' 11 | keyword = input("Input key word:") 12 | self.keyword = {'word': keyword} 13 | self.header = {'User_Agent': self.user_agent} 14 | self.url = "https://image.baidu.com/search/index?tn=baiduimage&" \ 15 | "ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0" \ 16 | "&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&" 17 | 18 | def get_html_text(self): 19 | try: 20 | r = requests.get(url=self.url, headers=self.header, params=self.keyword) 21 | r.raise_for_status() 22 | print(r.request.url) 23 | r.encoding = r.apparent_encoding 24 | self.html = r.text 25 | except urllib.error.URLError as e: 26 | print(e.reason) 27 | 28 | def pick_pic(self): 29 | print(self.html) 30 | soup = BeautifulSoup(self.html, 'lxml') 31 | img_list = soup.find('ul', class_='imglist clearfix pageNum0') 32 | print(img_list) 33 | 34 | 35 | if __name__ == "__main__": 36 | spider = Bai_du() 37 | spider.get_html_text() 38 | spider.pick_pic() 39 | 40 | # try: 41 | # if os.path.exists('./photo.jpg'): 42 | # os.remove('./photo.jpg') 43 | # print('删除同名文件') 44 | # with open('photo.jpg', 'ab+') as f: 45 | # f.write(r.content + b'\n') # HTTP响应内容二进制形式 46 | # print("下载完成") 47 | # except urllib.error.URLError as e: 48 | # print(e.reason) 49 | # print("爬取失败") -------------------------------------------------------------------------------- /spiders/股票爬虫.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | import traceback 4 | import os 5 | from bs4 import BeautifulSoup 6 | 7 | 8 | def get_html_text(url, code='utf-8'): 9 | try: 10 | r = requests.get(url, timeout=30) 11 | r.raise_for_status() 12 | r.encoding = code 13 | return r.text 14 | except: 15 | print('失败') 16 | return "" 17 | 18 | 19 | def get_stock_list(lst, stockurl): 20 | """ 21 | 从东方财富网获取股票列表 22 | :param lst: 23 | :param stockurl: 24 | """ 25 | html = get_html_text(stockurl, 'GB2312') 26 | soup = BeautifulSoup(html, 'lxml') 27 | all_a = soup.find_all('a') 28 | for i in all_a: 29 | try: 30 | print(type(i)) 31 | href = i.attrs['href'] 32 | # print(type(href)) 33 | lst.append(re.findall(r'[s][hz]\d{6}', href)[0]) 34 | # 
print(type(re.findall(r'[s][hz]\d{6}', href)[0])) 35 | except: 36 | continue 37 | 38 | 39 | def get_stock_info(lst, stockurl, fpath): 40 | count = 0 41 | for stock in lst: 42 | url = stockurl + stock + ".html" 43 | html =get_html_text(url) 44 | try: 45 | if html =='': 46 | continue 47 | info_dict = {} 48 | soup = BeautifulSoup(html, 'lxml') 49 | stock_info = soup.find('div', class_='stock-bets') 50 | # print(type(stock_info)) find()返回的是个标签 51 | name = stock_info.find(class_='bets-name') 52 | # print(name.text) 53 | info_dict.update({'股票名称': name.text.split()[0]}) 54 | 55 | keylist = stock_info.find_all('dt') 56 | # print(type(keylist)) find_all返回的是结果集 57 | valuelist = stock_info.find_all('dd') 58 | for i in range(len(keylist)): 59 | key = keylist[i].text 60 | val = valuelist[i].text 61 | info_dict[key] = val 62 | 63 | with open(fpath, 'a', encoding='utf-8') as f: 64 | f.write(str(info_dict) + '\n') 65 | count = count + 1 66 | print('当前速度:{:.2%}'.format(count/len(lst)), end='\r') 67 | 68 | except: 69 | count = count + 1 70 | print('当前速度:{:.2%}'.format(count/len(lst)), end='\r') 71 | traceback.print_exc() 72 | continue 73 | 74 | 75 | if __name__ == '__main__': 76 | stock_list_url = 'http://quote.eastmoney.com/stocklist.html' 77 | stock_info_url = 'https://gupiao.baidu.com/stock/' 78 | output_file = 'D://BaidustockInfo.txt' 79 | if os.path.exists(output_file): 80 | os.remove(output_file) 81 | print('删除同名文件') 82 | slist = [] 83 | get_stock_list(slist, stock_list_url) 84 | get_stock_info(slist, stock_info_url, output_file) -------------------------------------------------------------------------------- /spiders/豆瓣.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | import requests 4 | from lxml import etree 5 | # 第三方库User-Agent模块,它提供了最新、最全的浏览器的标识 6 | # 支持谷歌、火狐、IE、Opera 7 | # 使用命令pip3 install fake-useragent安装 8 | from fake_useragent import UserAgent 9 | 10 | 11 | class DBMovie(object): 12 | def __init__(self): 13 | self.base_url = 'https://movie.douban.com/top250' 14 | # self.ua = UserAgent() 15 | self.html_obj = None 16 | 17 | def open_file(self): 18 | csv_file = open('movie.csv', 'w', encoding='utf-8', newline='') 19 | self.writer = csv.DictWriter( 20 | csv_file, 21 | fieldnames=[ 22 | 'movie_rank', 'movie_name', 'movie_member', 'movie_star', 'movie_comment', 'movie_quote' 23 | ] 24 | ) 25 | self.writer.writeheader() 26 | 27 | def get_next_page_url(self): 28 | a = self.html_obj.xpath('//span[@class="next"]/a') 29 | if len(a) == 0: 30 | print('已经是最后一页') 31 | return 32 | next_page = a[0].xpath('@href')[0] 33 | # next_page:?start=50&filter= 34 | self.get_page_code(next_page) 35 | 36 | def write_movie_info(self, movie_list): 37 | for index, moive in enumerate(movie_list): 38 | self.writer.writerow(moive) 39 | print('第{}页写入完成'.format(index)) 40 | self.get_next_page_url() 41 | 42 | def get_content_by_xpath(self, html_obj): 43 | movie_list = [] 44 | item_div = html_obj.xpath('//div[@class="item"]') 45 | for item_tag in item_div: 46 | movie_dict = {} 47 | 48 | em = item_tag.xpath('.//em/text()')[0] 49 | print(em) 50 | hd = item_tag.xpath('.//div[@class="hd"]/a/span/text()') 51 | # 将hd中的3个信息拼接在一起 52 | info = '' 53 | for info_text in hd: 54 | content = info_text.strip('\n').strip() 55 | info += content 56 | # 演员 57 | member_info = item_tag.xpath('.//p[@class=""]/text()')[0].strip('\n').strip() 58 | # 电影评分 59 | star_number = item_tag.xpath('.//span[@class="rating_num"]/text()')[0] 60 | # 电影评论 61 | comment_number = 
item_tag.xpath('.//div[@class="star"]/span[last()]/text()')[0] 62 | comment_number = re.search(re.compile('(\d+)'), comment_number).group(1) 63 | # 电影点评 64 | quote = item_tag.xpath('.//span[@class="inq"]') 65 | if len(quote) != 0: 66 | quote = quote[0].xpath('text()')[0] 67 | else: 68 | quote = '影评不存在' 69 | 70 | # 将以上数据添加到movie_dict里 71 | movie_dict['movie_rank'] = em 72 | movie_dict['movie_name'] = info 73 | movie_dict['movie_member'] = member_info 74 | movie_dict['movie_star'] = star_number 75 | movie_dict['movie_comment'] = comment_number 76 | movie_dict['movie_quote'] = quote 77 | 78 | movie_list.append(movie_dict) 79 | 80 | self.write_movie_info(movie_list) 81 | 82 | def get_page_code(self, url=""): 83 | # abs_url:请求的绝对路径 84 | # 第2页 url = ?start=25&filter= 85 | # 第2次请求abs_url = https://www.douban.com/top250 + ?start=50&filter= 86 | abs_url = self.base_url + url 87 | content = requests.get(abs_url, headers={ 88 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0' 89 | }).content.decode() 90 | print(content) 91 | 92 | # 把网页源代码解析成文档树对象 93 | self.html_obj = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8')) 94 | 95 | # 调用get_content_by_xpath() 96 | self.get_content_by_xpath(self.html_obj) 97 | 98 | 99 | if __name__ == "__main__": 100 | movie_obj = DBMovie() 101 | movie_obj.open_file() 102 | movie_obj.get_page_code() 103 | --------------------------------------------------------------------------------
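A closing note on 豆瓣.py above: it imports UserAgent from fake_useragent (and its comments explain how to install the package), yet self.ua = UserAgent() stays commented out and a Firefox User-Agent string is hard-coded instead. A minimal sketch of how the randomized header would usually be wired in, written as a standalone helper rather than as a patch to the DBMovie class (the default URL is the same Top 250 address the class uses):

import requests
from fake_useragent import UserAgent

ua = UserAgent()  # builds and caches a pool of real browser identifiers on first use


def get_page_code(base_url='https://movie.douban.com/top250', url=''):
    # ua.random returns a different User-Agent string on each access,
    # replacing the hard-coded Firefox string in DBMovie.get_page_code()
    headers = {'User-Agent': ua.random}
    return requests.get(base_url + url, headers=headers).content.decode()


if __name__ == '__main__':
    print(get_page_code()[:300])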