├── .gitignore ├── Fluent Python └── 数据结构 │ └── 序列构成的数组.py ├── README.md ├── Web Scrapying with Python ├── Chapter01 │ └── BeautifulSouptest.py ├── Chapter02 │ ├── child_descendant.py │ ├── css.py │ ├── regex_BeautifulSoup.py │ └── soup_lambda.py ├── Chapter03 │ ├── 1_wikipedia.py │ ├── 2_getlinks.py │ ├── 3_get_all_link.py │ ├── findlinks.py │ └── wikiSpider │ │ ├── scrapy.cfg │ │ └── wikiSpider │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── article.cpython-36.pyc │ │ └── article.py ├── Chapter05 │ ├── csv_use.py │ ├── download_logo.py │ ├── download_src.py │ └── email_text.py ├── Chapter06 │ ├── 6-degreescrawlwiki.py │ └── read_csv.py ├── Chapter07 │ ├── clean-n-grams.py │ └── n-grams.py ├── Chapter08 │ ├── 2-gram-summary.py │ ├── 6-degrees-demo.py │ ├── 6-degrees-find.py │ └── MarkovGenerator.py ├── Chapter09 │ ├── 1-simpleForm.py │ ├── 2-fileSubmission.py │ ├── 3-cookies.py │ ├── 4-sessionCookies.py │ └── 5-BasicAuth.py ├── Chapter10 │ ├── 1-seleniumBasic.py │ ├── 2-waitForLoad.py │ └── 3-javascriptRedirect.py ├── Chapter11 │ ├── 1-basicImage.py │ ├── 2-cleanImage.py │ ├── 3-readWebImages.py │ └── 4-solveCaptcha.py ├── Chapter12 │ ├── headers.py │ ├── honeypotDetection.py │ └── seleniumCookies.py └── README.md ├── mongoDB资料 └── ReferenceCards15-PDF.pdf ├── pymongo ├── README.md ├── create_index.py ├── delete.py ├── insert.py ├── update.py └── 查找 │ ├── 1嵌入文档.py │ ├── 2嵌入数组.py │ ├── 3数组中嵌入文档.py │ ├── 4从查询中返回的项目字段.py │ ├── 5空字段或缺失字段.py │ └── 6限制显示行数.py ├── singleton ├── README.md ├── __new__.py ├── decorator.py ├── metaclass.py └── new_threading_safe.py └── spiders ├── Bs4基本元素.py ├── RE库基本使用.py ├── ajax今日头条.py ├── csdn_ajax.py ├── jdsearch.py ├── jianshu.py ├── newhouse.py ├── scrapy ├── BaiduStocks │ ├── BaiduStockInfo.txt │ ├── BaiduStocks │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── stocks.cpython-36.pyc │ │ │ └── stocks.py │ │ └── 调试.py │ └── scrapy.cfg ├── Tencent │ ├── Tencent │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── tencent.cpython-36.pyc │ │ │ └── tencent.py │ └── scrapy.cfg ├── jianshu │ ├── jishuspider │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── zhihu.py │ └── scrapy.cfg ├── movie │ ├── movie │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── movies.cpython-36.pyc │ │ │ └── movies.py │ └── scrapy.cfg ├── python123demo │ ├── python123demo │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── 
settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── demo.cpython-36.pyc │ │ │ └── demo.py │ └── scrapy.cfg ├── quoteturorial │ ├── quotes.json │ ├── quoteturorial │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── quotes.cpython-36.pyc │ │ │ └── quotes.py │ │ └── 调试.py │ └── scrapy.cfg ├── weibo │ ├── scrapy.cfg │ └── weibo │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── weibos.cpython-36.pyc │ │ └── weibos.py └── zhihuuser │ ├── scrapy.cfg │ └── zhihuuser │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ ├── pipelines.cpython-36.pyc │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── zhihu.cpython-36.pyc │ └── zhihu.py ├── secondSpider ├── Data_Output.py ├── Html_Downloader.py ├── Html_Parser.py ├── SpiderWork.py ├── URL_Manager.py ├── __pycache__ │ ├── Data_Output.cpython-36.pyc │ ├── Html_Downloader.cpython-36.pyc │ ├── Html_Parser.cpython-36.pyc │ └── URL_Manager.cpython-36.pyc ├── new_urls.txt ├── old_urls.txt └── start_Manager.py ├── selenium ├── Frame.py ├── Jiaohu.py ├── javaScript.py ├── 前进后退.py ├── 获取属性.py └── 选项卡管理.py ├── selenium个人邮箱.py ├── selenium模拟淘宝.py ├── 中国大学排名定向爬虫.py ├── 分布式爬虫 ├── Data_Output.py ├── Html_Downloader.py ├── Html_Parser.py ├── SpiderMan.py ├── URL_Manager.py └── __pycache__ │ ├── Data_Output.cpython-36.pyc │ ├── Html_Downloader.cpython-36.pyc │ ├── Html_Parser.cpython-36.pyc │ └── URL_Manager.cpython-36.pyc ├── 分布式进程 ├── taskManager.py └── taskWork.py ├── 医生信息索取.py ├── 多线程爬取医生.py ├── 淘宝商品信息爬取.py ├── 猫眼电影.py ├── 百度图片.py ├── 股票爬虫.py └── 豆瓣.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/.idea/** 2 | -------------------------------------------------------------------------------- /Fluent Python/数据结构/序列构成的数组.py: -------------------------------------------------------------------------------- 1 | # 容器序列 2 | # list,tuple,collection.deque 3 | # 存放的是他们所包含的任意对象的引用 4 | # 扁平序列 5 | # str,bytes,bytearray,memoryview和array.array 6 | # 存放的是值,扁平序列其实是一段连续的内存空间 7 | 8 | # 可变序列 9 | # list, 10 | # 不可变序列 11 | # tuple,str和bytes 12 | 13 | text = [i for i in range(10)] # 列表推导式,如果超过二行,考虑for循环 14 | print(text) # 列表推导的作用只用一个:生成列表 15 | 16 | 17 | text = (i for i in range(10)) # 生成表达式,只不过把方括号换成圆括号而已 18 | print(text) 19 | 20 | 21 | text = tuple(i for i in range(10)) # 生成表达式是一个函数调用过程中的唯一参数时,那么不需要 22 | print(text) # 用额外的括号把它围起来 23 | 24 | 25 | # 用*处理剩下的元素 在平行赋值中,*前缀只能出现在一个变量前面 26 | # 1 函数 27 | # def main(*args, **kwargs): 28 | # 在python中,函数用*args来获取不确定数量的位置参数,**kwargs获取不确定数量的关键字传参 29 | 30 | # 2 元组 31 | a, b , *rest = range(5) 32 | print(a, b, rest) 33 | 34 | 35 | # 列表或元祖的方法和属性 36 | # 列表 元组 37 | # s.__add__(s2) * * s+s2 拼接 ————创建一个新对象 38 | # s.__iadd__(s2) * s += 
s2,就地拼接————元组不可变 39 | # s.append(e) * 40 | # s.clear() * 删除所有元素 41 | # s.__contains__(e) * * s是否包含e————一般使用in 42 | # s.count(e) * * e在s中出现的次数 43 | # s.extend(it) * 把可迭代对象it追加给s 44 | # s.index(e) * * 在s中找到元素e第一次出现的位置 45 | # s.insert(p, e) * 在位置p之前插入元素e 46 | # s.pop([p]) * 删除最后或位于p位置的元素,并返回它的值 47 | # s.remove(e) * 删除s中第一次出现的e 48 | # s.reverse() * 就地把s的元素倒序排列 49 | # s.sort([key], [reverse]) * 就地对s的元素进行排序 50 | # sorted内置函数,可接受任何可迭代对象,最后返回一个列表。 51 | 52 | 53 | 54 | 55 | # 当数组不是首选时 56 | # 存放大量float,数组array效率更高 57 | # 频繁的对序列做先进先出的操作,deque(双端队列)的速度会更快 58 | # 如果查找操作很频繁,用set会更合适 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # py爬虫 2 | 3 | 4 | ## 设计模式 5 | - [单例模式](/singleton) 6 | 7 | ## 读书 8 | - [Python网络数据采集](/Web%20Scrapying%20with%20Python) 9 | - 进度:100% 10 | 11 | - [流畅的python](/Fluent%20Python) 12 | - 进度:50% 13 | 14 | 15 | ## 爬虫 16 | - [自己写的一些爬虫实例](/spiders) 17 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter01/BeautifulSouptest.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | 5 | html = requests.get("http://www.pythonscraping.com/pages/page1.html") 6 | 7 | ''' 8 | To get rid of this warning, change this: 9 | BeautifulSoup([your markup]) 10 | to this: 11 | BeautifulSoup([your markup], "html.parser") 12 | markup_type=markup_type)) 13 | ''' 14 | #data = BeautifulSoup(html.read()) 15 | 16 | soup = BeautifulSoup(html.text, "html.parser") 17 | 18 | 19 | print(soup.title) 20 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter02/child_descendant.py: -------------------------------------------------------------------------------- 1 | # from urllib.request import urlopen 2 | # from bs4 import BeautifulSoup 3 | # 4 | # html = urlopen("http://www.pythonscraping.com/pages/page3.html") 5 | # 6 | # data = BeautifulSoup(html, "html.parser") 7 | # 8 | # for child in data.find("table", {"id": "giftList"}).children: 9 | # print(child) 10 | 11 | import requests 12 | from bs4 import BeautifulSoup 13 | 14 | html = requests.get("http://www.pythonscraping.com/pages/page3.html") 15 | 16 | soup = BeautifulSoup(html.text, "html.parser") 17 | 18 | # children list_iterator 19 | print(type(soup.find("table", {"id": "giftList"}).children)) 20 | # include '\n' 21 | print(len(list(soup.find("table", {"id": "giftList"}).children))) 22 | print(len(list(soup.find("table", {"id": "giftList"}).descendants))) 23 | for child in soup.find("table", {"id": "giftList"}).children: 24 | print(repr(child)) 25 | print('*************') 26 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter02/css.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | html = requests.get("http://www.pythonscraping.com/pages/warandpeace.html") 5 | 6 | soup = BeautifulSoup(html.text, "html.parser") 7 | 8 | ''' 9 | name_list = data.findAll("span", {"class":"green"}) 10 | for name in name_list: 11 | print(name.get_text()) 12 | ''' 13 | 14 | prince_list = soup.findAll(text="the prince") 15 | print(len(prince_list)) 16 | -------------------------------------------------------------------------------- /Web Scrapying with 
Python/Chapter02/regex_BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | import re 6 | 7 | html = requests.get("http://www.pythonscraping.com/pages/page3.html") 8 | 9 | soup = BeautifulSoup(html.text, "html.parser") 10 | 11 | images = soup.findAll("img", {"src": re.compile("\.\.\/img\/gifts/img.*\.jpg")}) 12 | 13 | for i in images: 14 | print(i["src"]) 15 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter02/soup_lambda.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | html = requests.get("http://www.pythonscraping.com/pages/page3.html") 5 | 6 | soup = BeautifulSoup(html.text, "html.parser") 7 | 8 | 9 | prince_list = soup.findAll(lambda tag: len(tag.attrs) == 2) 10 | for i in prince_list: 11 | print(i) 12 | 13 | print(len(prince_list)) 14 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/1_wikipedia.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | html = requests.get("https://en.wikipedia.org/wiki/Kevin_Bacon") 6 | 7 | soup = BeautifulSoup(html.text, "html.parser") 8 | 9 | ''' 10 | for link in data.findAll("a"): 11 | if 'href' in link.attrs: 12 | print(link.attrs['href']) 13 | ''' 14 | 15 | for link in soup.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")): 16 | if 'href' in link.attrs: 17 | print(link.attrs['href']) 18 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/2_getlinks.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | import datetime 6 | import random 7 | import re 8 | 9 | random.seed(datetime.datetime.now()) 10 | 11 | 12 | def getLinks(articeUrl): 13 | html = requests.get("https://en.wikipedia.org" + articeUrl) 14 | soup = BeautifulSoup(html.text, "html.parser") 15 | 16 | return soup.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")) 17 | 18 | 19 | links = getLinks("/wiki/Kevin_Bacon") 20 | 21 | while len(links) > 0: 22 | newArticle = links[random.randint(0, len(links) - 1)].attrs['href'] 23 | print(newArticle) 24 | 25 | links = getLinks(newArticle) 26 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/3_get_all_link.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import re 4 | 5 | pages = set() 6 | 7 | 8 | def getLinks(articeUrl): 9 | global pages 10 | 11 | html = requests.get("https://en.wikipedia.org" + articeUrl) 12 | data = BeautifulSoup(html.text, "html.parser") 13 | 14 | try: 15 | print(data.h1.text) 16 | print(data.find(id="mw-content-text").findAll("p")[0]) 17 | print(data.find(id="ca-edit").find("span").find("a").attrs['href']) 18 | except AttributeError: 19 | print("Missing some attributes") 20 | 21 | for link in data.findAll("a", href=re.compile("^(/wiki/)")): 22 | if 'href' in link.attrs: 23 | if link.attrs['href'] not in pages: 24 | newPage = link.attrs['href'] 25 | print("new Page:") 26 | print(newPage) 27 | 
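                # Note: getLinks() calls itself for every unseen /wiki/ link it finds, so this
                # crawl has no depth bound and will eventually hit Python's default recursion
                # limit (about 1000 frames) on a site as large as Wikipedia.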
28 | pages.add(newPage) 29 | 30 | getLinks(newPage) 31 | 32 | 33 | getLinks("") 34 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/findlinks.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from bs4 import BeautifulSoup 3 | 4 | import re 5 | import datetime 6 | import random 7 | 8 | pages = set() 9 | random.seed(datetime.datetime.now()) 10 | 11 | 12 | def getInternalLinks(data, includeUrl): 13 | inLinks = [] 14 | 15 | for link in data.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")): 16 | if link.attrs['href'] is not None: 17 | if link.attrs['href'] not in inLinks: 18 | inLinks.append(link.attrs['href']) 19 | 20 | return inLinks 21 | 22 | 23 | def getExLinks(data, excludeUrl): 24 | exLinks = [] 25 | 26 | for link in data.findAll("a", href=re.compile("^(http|www)((?!" + excludeUrl + ").)*$")): 27 | if link.attrs['href'] is not None: 28 | if link.attrs['href'] not in exLinks: 29 | exLinks.append(link.attrs['href']) 30 | 31 | return exLinks 32 | 33 | 34 | def splitAddress(address): 35 | addressParts = address.replace("http://", "").split("/") 36 | 37 | return addressParts 38 | 39 | 40 | def getRandomExtLink(startPage): 41 | html = urlopen(startPage) 42 | data = BeautifulSoup(html, "html.parser") 43 | exLinks = getExLinks(data, splitAddress(startPage)[0]) 44 | 45 | if len(exLinks) == 0: 46 | inLinks = getInternalLinks(startPage) 47 | return getRandomExtLink(inLinks[random.randint(0, len(inLinks) - 1)]) 48 | else: 49 | return exLinks[random.randint(0, len(exLinks) - 1)] 50 | 51 | 52 | def followExtOnly(startSite): 53 | extLink = getRandomExtLink("http://oreilly.com") 54 | print("extLink:" + extLink) 55 | followExtOnly(extLink) 56 | 57 | 58 | followExtOnly("http://oreilly.com") 59 | 60 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = wikiSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = wikiSpider 12 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__init__.py -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | 11 | class WikispiderItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = Field() 15 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class WikispiderPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for wikiSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'wikiSpider' 13 | 14 | SPIDER_MODULES = ['wikiSpider.spiders'] 15 | NEWSPIDER_MODULE = 'wikiSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'wikiSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'wikiSpider.middlewares.WikispiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'wikiSpider.middlewares.WikispiderDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'wikiSpider.pipelines.WikispiderPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/__pycache__/article.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/__pycache__/article.cpython-36.pyc -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter03/wikiSpider/wikiSpider/spiders/article.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider 2 | from wikiSpider.items import WikispiderItem 3 | 4 | 5 | class ArticleSpider(Spider): 6 | name = "article" 7 | allowed_domains = ["en.wikipedia.org"] 8 | start_urls = ["https://en.wikipedia.org/wiki/Main_Page", 9 | "https://en.wikipedia.org/wiki/Python_%28programming_language%29"] 10 | 11 | def parse(self, response): 12 | item = WikispiderItem() 13 | title = response.xpath('//h1/text()')[0].extract() 14 | print("Title is:" + title) 15 | item['title'] = title 16 | return item 17 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter05/csv_use.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from urllib.request import urlopen 3 | from bs4 import BeautifulSoup 4 | 5 | html = urlopen('https://en.wikipedia.org/wiki/Comparison_of_text_editors') 6 | soup = BeautifulSoup(html, 'lxml') 7 | table = soup.find('table', {'class': 'wikitable'}) 8 | print(type(table)) 9 | rows = table.findAll('tr') 10 | 11 | csvFile = open("editors.csv", 'wt', newline='', encoding='utf-8') 12 | writer = csv.writer(csvFile) 13 | 14 | try: 15 | for row in rows: 16 | csvRow = [] 17 | for cell in row.findAll(['td', 'th']): 18 | csvRow.append(cell.text) 19 | writer.writerow(csvRow) 20 | finally: 21 | csvFile.close() -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter05/download_logo.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen, urlretrieve 2 | from bs4 import BeautifulSoup 3 | 4 | html = urlopen("http://www.pythonscraping.com") 5 | data = BeautifulSoup(html, "html.parser") 6 | 7 | logo_location = data.find("a", {"id": "logo"}).find("img")["src"] 8 | urlretrieve(logo_location, "logo.jpg") 9 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter05/download_src.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from urllib.request import 
urlopen, urlretrieve 4 | from bs4 import BeautifulSoup 5 | 6 | download_dir = "downloaded" 7 | 8 | baseUrl = "http://pythonscraping.com" 9 | 10 | 11 | def getURI(url, source): 12 | if source.startswith("http://www."): 13 | url = "http://" + source[11:] 14 | elif source.startswith("http://"): 15 | url = source 16 | elif source.startswith("www."): 17 | url = "http://" + source[4:] 18 | else: 19 | url = baseUrl + "/" + source 20 | 21 | if baseUrl not in url: 22 | print('not in url') 23 | return None 24 | 25 | return url 26 | 27 | 28 | def getDownloadPath(baseUrl, url, download_dir): 29 | path = url.replace("www.", "") 30 | path = path.replace(baseUrl, "") 31 | path = download_dir + path 32 | 33 | print(path) 34 | dir = os.path.dirname(path) 35 | print(dir) 36 | if not os.path.exists(dir): 37 | os.makedirs(dir) 38 | 39 | return path 40 | 41 | 42 | def main(): 43 | html = urlopen(baseUrl) 44 | soup = BeautifulSoup(html, "html.parser") 45 | download_list = soup.findAll(src=True) 46 | 47 | for download in download_list: 48 | fileUrl = getURI(baseUrl, download["src"]) 49 | if fileUrl is not None: 50 | print(fileUrl) 51 | urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, download_dir)) 52 | # break 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter05/email_text.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | 3 | from email.mime.text import MIMEText 4 | 5 | msg = MIMEText("This is a mail test") 6 | 7 | msg['Subject'] = "An Email ALERT" 8 | msg['From'] = "ds@ds-virtual-machine" 9 | msg['To'] = "942203701@qq.com" 10 | 11 | s = smtplib.SMTP('localhost') 12 | s.send_message(msg) 13 | s.quit() 14 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter06/6-degreescrawlwiki.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | import pymysql 4 | from urllib.request import urlopen 5 | 6 | conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', db='mysql', charset='utf8') 7 | cur = conn.cursor() 8 | cur.execute("USE wikipedia") 9 | 10 | 11 | def pageScraped(url): 12 | cur.execute("SELECT * FROM pages WHERE url = %s", (url)) 13 | if cur.rowcount == 0: 14 | return False 15 | page = cur.fetchone() 16 | 17 | cur.execute("SELECT * FROM links WHERE fromPageId = %s", (int(page[0]))) 18 | if cur.rowcount == 0: 19 | return False 20 | return True 21 | 22 | 23 | def insertPageIfNotExists(url): 24 | cur.execute("SELECT * FROM pages WHERE url = %s", (url)) 25 | if cur.rowcount == 0: 26 | cur.execute("INSERT INTO pages (url) VALUES (%s)", (url)) 27 | conn.commit() 28 | return cur.lastrowid 29 | else: 30 | return cur.fetchone()[0] 31 | 32 | 33 | def insertLink(fromPageId, toPageId): 34 | cur.execute("SELECT * FROM links WHERE fromPageId = %s AND toPageId = %s", (int(fromPageId), int(toPageId))) 35 | if cur.rowcount == 0: 36 | cur.execute("INSERT INTO links (fromPageId, toPageId) VALUES (%s, %s)", (int(fromPageId), int(toPageId))) 37 | conn.commit() 38 | 39 | 40 | def getLinks(pageUrl, recursionLevel): 41 | global pages 42 | if recursionLevel > 4: 43 | return 44 | pageId = insertPageIfNotExists(pageUrl) 45 | html = urlopen("http://en.wikipedia.org" + pageUrl) 46 | bsObj = BeautifulSoup(html, "html.parser") 47 | for link in bsObj.findAll("a", 
href=re.compile("^(/wiki/)((?!:).)*$")): 48 | insertLink(pageId, insertPageIfNotExists(link.attrs['href'])) 49 | if not pageScraped(link.attrs['href']): 50 | # We have encountered a new page, add it and search it for links 51 | newPage = link.attrs['href'] 52 | print(newPage) 53 | getLinks(newPage, recursionLevel + 1) 54 | else: 55 | print("Skipping: " + str(link.attrs['href']) + " found on " + pageUrl) 56 | 57 | 58 | getLinks("/wiki/Kevin_Bacon", 0) 59 | cur.close() 60 | conn.close() 61 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter06/read_csv.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from io import StringIO 3 | import csv 4 | 5 | ''' 6 | Don't name your file csv.py. 7 | When you do, Python will look in your file for the csv code instead of the standard library csv module. 8 | ''' 9 | 10 | data = urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv").read().decode('ascii', 'ignore') 11 | 12 | dataFile = StringIO(data) 13 | 14 | ''' 15 | csvRead = csv.reader(dataFile) 16 | for row in csvRead: 17 | print(row) 18 | ''' 19 | 20 | dictReader = csv.DictReader(dataFile) 21 | 22 | print(dictReader.fieldnames) 23 | 24 | for row in dictReader: 25 | print(row) 26 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter07/clean-n-grams.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from bs4 import BeautifulSoup 3 | import re 4 | import string 5 | from collections import OrderedDict 6 | 7 | 8 | def cleanInput(input): 9 | ''' 10 | 移除转义字符,过滤Unicode字符 11 | ''' 12 | # input = re.sub('\n+', " ", input) # 替换换行符 13 | input = re.sub('\[[0-9]*\]', "", input) # remove digit 14 | input = re.sub(r'\s', " ", input) # remove all blank character 15 | input = bytes(input, "UTF-8") # 更改编码 16 | input = input.decode("ascii", "ignore") 17 | 18 | cleanInput = [] 19 | input = input.split(' ') 20 | for item in input: 21 | item = item.strip(string.punctuation) 22 | if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'): 23 | cleanInput.append(item) 24 | return cleanInput 25 | 26 | 27 | def getNgrams(input, n): 28 | input = cleanInput(input) 29 | # print(input) 30 | output = dict() 31 | for i in range(len(input) - n + 1): 32 | newNGram = " ".join(input[i:i + n]) 33 | if newNGram in output: 34 | output[newNGram] += 1 35 | else: 36 | output[newNGram] = 1 37 | return output 38 | 39 | 40 | html = urlopen("https://en.wikipedia.org/wiki/Python_(programming_language)") 41 | soup = BeautifulSoup(html, "html.parser") 42 | content = soup.find("div", {"id": "mw-content-text"}).get_text() 43 | 44 | ngrams = getNgrams(content, 2) 45 | 46 | # Using OrderedDict sort 47 | ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True)) 48 | 49 | print(ngrams) 50 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter07/n-grams.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from bs4 import BeautifulSoup 3 | import re 4 | 5 | def ngrams(indata, n): 6 | # indata = re.split(r' |,|;|\n|', indata) 7 | indata = indata.split() 8 | print(indata) 9 | outodata = [] 10 | 11 | for i in range(len(indata) - n + 1): 12 | outodata.append(indata[i:i + n]) 13 | print(outodata) 14 | break 15 | 
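    # Note: the print() and break inside the loop above are debugging leftovers; with the
    # break in place only the first n-gram is collected, which is why the count printed at
    # the bottom of this script comes out as 1.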
16 | return outodata 17 | 18 | 19 | html = urlopen("https://en.wikipedia.org/wiki/Python_(programming_language)") 20 | soup = BeautifulSoup(html, "html.parser") 21 | 22 | content = soup.find("div", {"id": "mw-content-text"}).get_text() 23 | # content = soup.find("div", id_="mw-content-text").get_text() 24 | 25 | ngram = ngrams(content, 2) 26 | 27 | print(ngram) 28 | print("2-ngrams count is:" + str(len(ngram))) 29 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter08/2-gram-summary.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from bs4 import BeautifulSoup 3 | import re 4 | import string 5 | import operator 6 | 7 | 8 | def isCommon(ngram): 9 | commonWords = ["the", "be", "and", "of", "a", "in", "to", "have", "it", "i", "that", "for", "you", "he", "with", 10 | "on", "do", "say", "this", "they", "is", "an", "at", "but", "we", "his", "from", "that", "not", "by", 11 | "she", "or", "as", "what", "go", "their", "can", "who", "get", "if", "would", "her", "all", "my", 12 | "make", "about", "know", "will", "as", "up", "one", "time", "has", "been", "there", "year", "so", 13 | "think", "when", "which", "them", "some", "me", "people", "take", "out", "into", "just", "see", 14 | "him", "your", "come", "could", "now", "than", "like", "other", "how", "then", "its", "our", "two", 15 | "more", "these", "want", "way", "look", "first", "also", "new", "because", "day", "more", "use", 16 | "no", "man", "find", "here", "thing", "give", "many", "well"] 17 | for word in ngram: 18 | if word in commonWords: 19 | return True 20 | return False 21 | 22 | 23 | def cleanText(input): 24 | input = re.sub('\n+', " ", input).lower() 25 | input = re.sub('\[[0-9]*\]', "", input) 26 | input = re.sub(' +', " ", input) 27 | input = re.sub("u\.s\.", "us", input) 28 | input = bytes(input, "UTF-8") 29 | input = input.decode("ascii", "ignore") 30 | return input 31 | 32 | 33 | def cleanInput(input): 34 | input = cleanText(input) 35 | cleanInput = [] 36 | input = input.split(' ') 37 | for item in input: 38 | item = item.strip(string.punctuation) 39 | if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'): 40 | cleanInput.append(item) 41 | print(len(cleanInput)) 42 | return cleanInput 43 | 44 | 45 | def getNgrams(input, n): 46 | input = cleanInput(input) 47 | output = {} 48 | for i in range(len(input) - n + 1): 49 | ngramTemp = " ".join(input[i:i + n]) 50 | if ngramTemp not in output: 51 | output[ngramTemp] = 0 52 | output[ngramTemp] += 1 53 | return output 54 | 55 | 56 | def getFirstSentenceContaining(ngram, content): 57 | # print(ngram) 58 | sentences = content.split(".") 59 | # print(sentences) 60 | for sentence in sentences: 61 | if ngram in sentence.lower(): 62 | return sentence 63 | return "" 64 | 65 | 66 | content = str(urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt").read(), 'utf-8') 67 | # print(content) 68 | ngrams = getNgrams(content, 2) 69 | sortedNGrams = sorted(ngrams.items(), key=operator.itemgetter(1), reverse=True) 70 | print(len(sortedNGrams)) 71 | 72 | selected_ngrams = [] 73 | for item in sortedNGrams: 74 | if item[1] > 2 and not isCommon(item[0].split()): 75 | selected_ngrams.append(item) 76 | print(selected_ngrams) 77 | print('the number of the significant 2-grams is:' + str(len(selected_ngrams))) 78 | 79 | count = 0 80 | for ngram in selected_ngrams: 81 | count += 1 82 | print(ngram) 83 | print(getFirstSentenceContaining(ngram[0], content)) 
84 | if count > 5: 85 | break -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter08/6-degrees-demo.py: -------------------------------------------------------------------------------- 1 | class SolutionFound(RuntimeError): 2 | def __init__(self, message): 3 | self.message = message 4 | 5 | 6 | def getLinks(fromPageId): 7 | data = {1: [2, 3, 4], 8 | 2: [5, 6, 7], 9 | 3: [8, 9, 10], 10 | 4: [11, 12, 13], 11 | 6: [14, 15, 16]} 12 | if fromPageId not in data: 13 | return None 14 | return data[fromPageId] 15 | 16 | 17 | def constructDict(currentPageId): 18 | links = getLinks(currentPageId) 19 | if links: 20 | return dict(zip(links, [{}] * len(links))) 21 | return {} 22 | 23 | 24 | def searchDepth(targetPageId, currentPageId, linkTree, depth): 25 | print('depth: ', depth) 26 | # print(id(linkTree)) 27 | if depth == 0: 28 | return linkTree 29 | if not linkTree: 30 | linkTree = constructDict(currentPageId) 31 | if not linkTree: 32 | return {} 33 | if targetPageId in linkTree.keys(): 34 | print('TAREGT: ' + str(targetPageId) + ' FOUND!') 35 | raise SolutionFound('PAGE:' + str(currentPageId)) 36 | 37 | for branchkey, branchvalue in linkTree.items(): 38 | try: 39 | linkTree[branchkey] = searchDepth(targetPageId, branchkey, 40 | branchvalue, depth - 1) 41 | except SolutionFound as e: 42 | print(e.message) 43 | raise SolutionFound('PAGE:' + str(currentPageId)) 44 | return linkTree 45 | 46 | 47 | try: 48 | linkTree = searchDepth(14, 1, {}, 4) 49 | 50 | print('No solution found') 51 | except SolutionFound as e: 52 | print(e.message) 53 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter08/6-degrees-find.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from bs4 import BeautifulSoup 3 | import pymysql 4 | 5 | conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='mysql', charset='utf8') 6 | cur = conn.cursor() 7 | cur.execute("USE wikipedia") 8 | 9 | 10 | def getUrl(pageId): 11 | cur.execute("SELECT url FROM pages WHERE id = %s", (int(pageId))) 12 | if cur.rowcount == 0: 13 | return None 14 | return cur.fetchone()[0] 15 | 16 | 17 | def getLinks(fromPageId): 18 | cur.execute("SELECT toPageId FROM links WHERE fromPageId = %s", (int(fromPageId))) 19 | if cur.rowcount == 0: 20 | return None 21 | return [x[0] for x in cur.fetchall()] 22 | 23 | 24 | def searchBreadth(targetPageId, currentPageId, depth, nodes): 25 | if nodes is None or len(nodes) == 0: 26 | return None 27 | if depth <= 0: 28 | for node in nodes: 29 | if node == targetPageId: 30 | return [node] 31 | return None 32 | # depth is greater than 0 -- go deeper! 
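    # Each level re-queries the links table for every node and recurses with depth - 1,
    # so together with the range(0, 4) driver at the bottom of this script this behaves
    # like an iterative-deepening search whose cost grows roughly exponentially with depth.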
33 | for node in nodes: 34 | found = searchBreadth(targetPageId, node, depth - 1, getLinks(node)) 35 | if found is not None: 36 | return found + [currentPageId] # list.append() returns None, so build the path with + instead 37 | return None 38 | 39 | 40 | nodes = getLinks(1) 41 | targetPageId = 123428 42 | for i in range(0, 4): 43 | found = searchBreadth(targetPageId, 1, i, nodes) 44 | if found is not None: 45 | print(found) 46 | for node in found: 47 | print(getUrl(node)) 48 | break 49 | else: 50 | print("No path found") 51 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter08/MarkovGenerator.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from random import randint 3 | import collections 4 | 5 | 6 | def wordListSum(wordList): 7 | sum = 0 8 | for word, value in wordList.items(): 9 | sum += value 10 | return sum 11 | 12 | 13 | def retrieveRandomWord(wordList): 14 | randIndex = randint(1, wordListSum(wordList)) 15 | for word, value in wordList.items(): 16 | randIndex -= value 17 | if randIndex <= 0: 18 | return word 19 | 20 | 21 | def buildWordDict(text): 22 | # Remove newlines and quotes 23 | text = text.replace("\n", " ") 24 | text = text.replace("\"", "") 25 | 26 | # Make sure punctuation marks are treated as their own "word," so they will be included 27 | # in the Markov chain 28 | punctuation = [',', '.', ';', ':'] 29 | for symbol in punctuation: 30 | text = text.replace(symbol, " " + symbol + " ") 31 | 32 | words = text.split(" ") 33 | # Filter out empty words 34 | words = [word for word in words if word != ""] 35 | 36 | wordDict = {} 37 | for i in range(1, len(words)): 38 | if words[i - 1] not in wordDict: 39 | # Create a new dictionary for this word 40 | wordDict[words[i - 1]] = {} 41 | if words[i] not in wordDict[words[i - 1]]: 42 | wordDict[words[i - 1]][words[i]] = 0 43 | wordDict[words[i - 1]][words[i]] += 1 44 | 45 | """ 46 | # defaultdict: 47 | 48 | wordDict = collections.defaultdict(dict) 49 | for i in range(1, len(words)): 50 | if words[i] not in wordDict[words[i - 1]]: 51 | wordDict[words[i - 1]][words[i]] = 0 52 | wordDict[words[i - 1]][words[i]] += 1 53 | """ 54 | 55 | """ 56 | # setdefault: 57 | 58 | 59 | wordDict = {} 60 | for i in range(1, len(words)): 61 | if words[i] not in wordDict.setdefault(words[i - 1], {}): 62 | wordDict[words[i - 1]][words[i]] = 0 63 | wordDict[words[i - 1]][words[i]] += 1 64 | """ 65 | 66 | return wordDict 67 | 68 | 69 | text = str(urlopen("https://pythonscraping.com/files/inaugurationSpeech.txt").read(), 'utf-8') 70 | wordDict = buildWordDict(text) 71 | # print(wordDict) 72 | 73 | # Generate a Markov chain of length 100 74 | length = 100 75 | chain = "" 76 | currentWord = "I" 77 | for i in range(0, length): 78 | chain += currentWord + " " 79 | # print(wordDict[currentWord]) 80 | currentWord = retrieveRandomWord(wordDict[currentWord]) 81 | 82 | print(chain) 83 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter09/1-simpleForm.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | params = {'firstname': 'Ryan', 'lastname': 'Mitchell'} 4 | r = requests.post("http://pythonscraping.com/files/processing.php", data=params) 5 | print(r.text) 6 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter09/2-fileSubmission.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | 3 | files = {'uploadFile': open('../files/Python-logo.png', 'rb')} 4 | r = requests.post("http://pythonscraping.com/pages/processing2.php", 5 | files=files) 6 | print(r.text) 7 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter09/3-cookies.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | params = {'username': 'Ryan', 'password': 'password'} 4 | r = requests.post("http://pythonscraping.com/pages/cookies/welcome.php", params) 5 | print("Cookie is set to:") 6 | print(r.cookies.get_dict()) 7 | print("-----------") 8 | print("Going to profile page...") 9 | r = requests.get("http://pythonscraping.com/pages/cookies/profile.php", cookies=r.cookies) 10 | print(r.text) 11 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter09/4-sessionCookies.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | session = requests.Session() 4 | 5 | params = {'username': 'username', 'password': 'password'} 6 | s = session.post("http://pythonscraping.com/pages/cookies/welcome.php", params) 7 | print("Cookie is set to:") 8 | print(s.cookies.get_dict()) 9 | print("-----------") 10 | print("Going to profile page...") 11 | s = session.get("http://pythonscraping.com/pages/cookies/profile.php") 12 | print(s.text) 13 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter09/5-BasicAuth.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.auth import AuthBase 3 | from requests.auth import HTTPBasicAuth 4 | 5 | auth = HTTPBasicAuth('ryan', 'password') 6 | r = requests.post(url="http://pythonscraping.com/pages/auth/login.php", auth=auth) 7 | print(r.text) 8 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter10/1-seleniumBasic.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time 3 | 4 | driver = webdriver.Firefox() 5 | # driver = webdriver.PhantomJS() 6 | driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html') 7 | time.sleep(5) 8 | print(driver.find_element_by_id('content').text) 9 | driver.close() -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter10/2-waitForLoad.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support.ui import WebDriverWait 4 | from selenium.webdriver.support import expected_conditions as EC 5 | 6 | driver = webdriver.Firefox() 7 | driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html") 8 | try: 9 | element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton"))) 10 | finally: 11 | print(driver.find_element_by_id("content").text) 12 | driver.close() -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter10/3-javascriptRedirect.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 
import time 3 | from selenium.webdriver.remote.webelement import WebElement 4 | from selenium.common.exceptions import StaleElementReferenceException 5 | 6 | 7 | def waitForLoad(driver): 8 | elem = driver.find_element_by_tag_name("html") 9 | count = 0 10 | while True: 11 | count += 1 12 | if count > 20: 13 | print("Timing out after 10 seconds and returning") 14 | return 15 | time.sleep(.5) 16 | try: 17 | elem == driver.find_element_by_tag_name("html") 18 | except StaleElementReferenceException: 19 | return 20 | 21 | 22 | driver = webdriver.Firefox() 23 | driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html") 24 | waitForLoad(driver) 25 | print(driver.page_source) 26 | 27 | driver.close() 28 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter11/1-basicImage.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageFilter 2 | 3 | kitten = Image.open("kitten.jpg") 4 | blurryKitten = kitten.filter(ImageFilter.GaussianBlur) 5 | blurryKitten.save("kitten_blurred.jpg") 6 | blurryKitten.show() -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter11/2-cleanImage.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import subprocess 3 | 4 | 5 | def cleanFile(filePath, newFilePath): 6 | image = Image.open(filePath) 7 | 8 | # Set a threshold value for the image, and save 9 | image = image.point(lambda x: 0 if x < 143 else 255) 10 | image.save(newFilePath) 11 | 12 | # call tesseract to do OCR on the newly created image 13 | subprocess.call(["tesseract", newFilePath, "output"]) 14 | 15 | # Open and read the resulting data file 16 | outputFile = open("output.txt", 'r') 17 | print(outputFile.read()) 18 | outputFile.close() 19 | 20 | 21 | cleanFile("text_2.png", "text_2_clean.png") -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter11/3-readWebImages.py: -------------------------------------------------------------------------------- 1 | import time 2 | from urllib.request import urlretrieve 3 | import subprocess 4 | from selenium import webdriver 5 | 6 | # driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs') 7 | driver = webdriver.Firefox() 8 | driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200") 9 | time.sleep(2) 10 | 11 | driver.find_element_by_id("img-canvas").click() 12 | # The easiest way to get exactly one of every page 13 | imageList = set() 14 | 15 | # Wait for the page to load 16 | time.sleep(10) 17 | print(driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style")) 18 | while "pointer" in driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"): 19 | # While we can click on the right arrow, move through the pages 20 | driver.find_element_by_id("sitbReaderRightPageTurner").click() 21 | time.sleep(2) 22 | # Get any new pages that have loaded (multiple pages can load at once) 23 | pages = driver.find_elements_by_xpath("//div[@class='pageImage']/div/img") 24 | for page in pages: 25 | image = page.get_attribute("src") 26 | imageList.add(image) 27 | # break 28 | 29 | driver.quit() 30 | 31 | # Start processing the images we've collected URLs for with Tesseract 32 | for image in sorted(imageList): 33 | urlretrieve(image, 
"page.jpg") 34 | p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 35 | p.wait() 36 | f = open("page.txt", "r") 37 | print(f.read()) 38 | # break 39 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter11/4-solveCaptcha.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlretrieve 2 | from urllib.request import urlopen 3 | from bs4 import BeautifulSoup 4 | import subprocess 5 | import requests 6 | from PIL import Image 7 | from PIL import ImageOps 8 | 9 | 10 | def cleanImage(imagePath): 11 | image = Image.open(imagePath) 12 | image = image.point(lambda x: 0 if x < 143 else 255) 13 | borderImage = ImageOps.expand(image, border=20, fill='white') 14 | borderImage.save(imagePath) 15 | 16 | 17 | html = urlopen("http://www.pythonscraping.com/humans-only") 18 | bsObj = BeautifulSoup(html, "html.parser") 19 | # Gather prepopulated form values 20 | imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"] 21 | formBuildId = bsObj.find("input", {"name": "form_build_id"})["value"] 22 | captchaSid = bsObj.find("input", {"name": "captcha_sid"})["value"] 23 | captchaToken = bsObj.find("input", {"name": "captcha_token"})["value"] 24 | 25 | captchaUrl = "http://pythonscraping.com" + imageLocation 26 | urlretrieve(captchaUrl, "captcha.jpg") 27 | cleanImage("captcha.jpg") 28 | p = subprocess.Popen(["tesseract", "captcha.jpg", "captcha"], stdout= 29 | subprocess.PIPE, stderr=subprocess.PIPE) 30 | p.wait() 31 | f = open("captcha.txt", "r") 32 | 33 | # Clean any whitespace characters 34 | captchaResponse = f.read().replace(" ", "").replace("\n", "") 35 | print("Captcha solution attempt: " + captchaResponse) 36 | 37 | if len(captchaResponse) == 5: 38 | params = {"captcha_token": captchaToken, "captcha_sid": captchaSid, 39 | "form_id": "comment_node_page_form", "form_build_id": formBuildId, 40 | "captcha_response": captchaResponse, "name": "Ryan Mitchell", 41 | "subject": "I come to seek the Grail", 42 | "comment_body[und][0][value]": 43 | "...and I am definitely not a bot"} 44 | r = requests.post("http://www.pythonscraping.com/comment/reply/10", 45 | data=params) 46 | responseObj = BeautifulSoup(r.text, 'html.parse') 47 | if responseObj.find("div", {"class": "messages"}) is not None: 48 | print(responseObj.find("div", {"class": "messages"}).get_text()) 49 | else: 50 | print("There was a problem reading the CAPTCHA correctly!") 51 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter12/headers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | session = requests.Session() 5 | headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"} 6 | url = "https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending" 7 | req = session.get(url, headers=headers) 8 | 9 | bsObj = BeautifulSoup(req.text, "lxml") 10 | print(bsObj.find("table",{"class":"table-striped"}).get_text) 11 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter12/honeypotDetection.py: -------------------------------------------------------------------------------- 1 | from selenium 
import webdriver 2 | from selenium.webdriver.remote.webelement import WebElement 3 | 4 | driver = webdriver.PhantomJS(executable_path='') 5 | driver.get("http://pythonscraping.com/pages/itsatrap.html") 6 | links = driver.find_elements_by_tag_name("a") 7 | for link in links: 8 | if not link.is_displayed(): 9 | print("The link "+link.get_attribute("href")+" is a trap") 10 | 11 | fields = driver.find_elements_by_tag_name("input") 12 | for field in fields: 13 | if not field.is_displayed(): 14 | print("Do not change value of "+field.get_attribute("name")) 15 | -------------------------------------------------------------------------------- /Web Scrapying with Python/Chapter12/seleniumCookies.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | # driver = webdriver.PhantomJS(executable_path='') 4 | driver = webdriver.Firefox() 5 | driver.get("http://pythonscraping.com") 6 | driver.implicitly_wait(1) 7 | print(driver.get_cookies()) 8 | 9 | savedCookies = driver.get_cookies() 10 | 11 | driver2 = webdriver.PhantomJS(executable_path='') 12 | driver2.get("http://pythonscraping.com") 13 | driver2.delete_all_cookies() 14 | for cookie in savedCookies: 15 | driver2.add_cookie(cookie) 16 | 17 | driver2.get("http://pythonscraping.com") 18 | driver2.implicitly_wait(1) 19 | print(driver2.get_cookies()) 20 | -------------------------------------------------------------------------------- /Web Scrapying with Python/README.md: -------------------------------------------------------------------------------- 1 | ## Web Scraping with Python 2 | 3 | 4 | # Python网络数据采集 5 | - [第1章 初见网络爬虫](Chapter01/) 6 | - [第2章 复杂HTML解析](Chapter02/) 7 | - [第3章 开始采集](Chapter03/) 8 | - [第4章 使用API](Chapter04/) 9 | - [第5章 存储数据](Chapter05/) 10 | - [第6章 读取文档](Chapter06/) 11 | - [第7章 数据清洗](Chapter07/) 12 | - [第8章 自然语言处理](Chapter08/) 13 | - [第9章 穿越网页表单与登录窗口采集](Chapter09/) 14 | - [第10章 采集JavaScript](Chapter10/) 15 | - [第11章 图像识别与文字处理](Chapter11/) 16 | - [第12章 避开采集陷阱](Chapter12/) 17 | - [第13章 用爬虫测试网站](Chapter13/) 18 | - [第14章 远程采集](Chapter14/) 19 | 20 | 21 | ## 注意 22 | 此书代码使用的是Python3 23 | 24 | ## 实践 25 | - [马尔可夫文字生成器](Chapter08/MarkovGenerator.py) 26 | - [深度优先遍历](Chapter08/6-degrees-demo.py) 27 | - [维基百科广度优先遍历](Chapter08/6-degrees-find.py) 28 | -------------------------------------------------------------------------------- /mongoDB资料/ReferenceCards15-PDF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/mongoDB资料/ReferenceCards15-PDF.pdf -------------------------------------------------------------------------------- /pymongo/README.md: -------------------------------------------------------------------------------- 1 | pymongo 对mongoDB进行 2 | 插入 3 | 查找 4 | 更新 5 | -------------------------------------------------------------------------------- /pymongo/create_index.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | import pymongo 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | result = db.profiles.create_index([('user_id', pymongo.ASCENDING)], unique=True) 7 | 8 | # The index prevents us from inserting a document whose user_id is already in the collection 9 | user_profiles = [ 10 | {'user_id': 222, 'name': 'Luke'}, 11 | {'user_id': 252, 'name': 'Ziltoid'}] 12 | result = db.profiles.insert_many(user_profiles) 13 | client.close() 
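A minimal sketch, assuming the same local MongoDB instance and the 'text' database used by create_index.py, of how the unique index above behaves: inserting another document with an existing user_id raises DuplicateKeyError.

from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

client = MongoClient(host='localhost', port=27017)
db = client['text']  # same database as create_index.py

try:
    # user_id 222 was already inserted by create_index.py, so the unique index rejects this
    db.profiles.insert_one({'user_id': 222, 'name': 'Luke again'})
except DuplicateKeyError as exc:
    print('insert rejected by the unique index:', exc)

client.close()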
-------------------------------------------------------------------------------- /pymongo/delete.py: --------------------------------------------------------------------------------
1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # database name 5 | 6 | db['inventory'].delete_one({}) 7 | # Delete operations do not drop indexes, even if deleting all documents from a collection 8 | db['inventory'].delete_many({}) 9 | 10 | db['inventory'].remove()  # deprecated in PyMongo 3; delete_many({}) above already clears the collection 11 | client.close()
-------------------------------------------------------------------------------- /pymongo/insert.py: --------------------------------------------------------------------------------
1 | from pymongo import MongoClient, InsertOne 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # database name 5 | 6 | # -----------------Inserting a Document---------------------# 7 | post_id = db['inventory'].insert_one( 8 | {'item': "canvas", 'qty': 100, 'tags': ["cotton"], 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}} 9 | ) 10 | 11 | db['inventory'].bulk_write([InsertOne({'item': 'bulk-demo', 'qty': 1})])  # bulk_write needs a list of write operations 12 | print(post_id) 13 | 14 | # ------------ list all collection names in this database -------------------# 15 | print(db.collection_names())  # deprecated in newer PyMongo; list_collection_names() is the replacement 16 | 17 | # --------Getting a Single Document With find_one()--------# 18 | 19 | print(db['inventory'].find_one()) 20 | print(db['inventory'].find_one({'item': 'canvas'})) 21 | 22 | # A common task in web applications is to get an ObjectId from the 23 | # request URL and find the matching document. 24 | # It’s necessary in this case to convert the ObjectId from a string before passing it to find_one: 25 | # 26 | 27 | 28 | # -----------------Bulk insert ---------------------# 29 | post_id = db['inventory'].insert_many( 30 | [{'item': "canvas1", 'qty': 100, 'tags': ["cotton"], 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}}, 31 | {'item': "canvas2", 'qty': 100, 'tags': ["cotton"], 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}} 32 | ] 33 | ) 34 | print(post_id) 35 | print(post_id.inserted_ids) 36 | client.close()  # PyMongo 3 reopens the connection automatically when db is used again below 37 | 38 | # ------Querying for More Than One Document----------# 39 | # --------------------find()-------------------------# 40 | 41 | 42 | 43 | # -------------------Aggregation Framework-----------# 44 | result = db['aggregation'].insert_many( 45 | [{'x': "1", 'tags': ["cat", 'dog', 'mouse']}, 46 | {'x': "2", 'tags': ["cat", 'dog', 'mouse']}, 47 | {'x': "3", 'tags': ["cat", 'dog', 'mouse']}, 48 | {'x': "4", 'tags': ['dog']}, 49 | {'x': "5", 'tags': ['pig']}, 50 | ] 51 | ) 52 | 53 | from bson.son import SON 54 | pipeline = [ 55 | {'$unwind': '$tags'}, 56 | {'$group': {'_id': '$tags', 'count': {'$sum': 1}}}, 57 | {'$sort': SON([('count', -1), ('_id', -1)])}, 58 | ] 59 | print(list(db['aggregation'].aggregate(pipeline))) 60 | 61 | # map_reduce() needs JavaScript map/reduce functions and an output collection name 62 | from bson.code import Code 63 | mapper = Code("function () { this.tags.forEach(function (t) { emit(t, 1); }); }") 64 | reducer = Code("function (key, values) { return Array.sum(values); }") 65 | print(db['aggregation'].map_reduce(mapper, reducer, 'tag_counts'))
-------------------------------------------------------------------------------- /pymongo/update.py: --------------------------------------------------------------------------------
1 | # Add a new field to documents in a collection 2 | from pymongo import MongoClient 3 | 4 | client = MongoClient(host='localhost', port=27017) 5 | db = client['text'] # database name 6 | 7 | # -----------------Inserting a Document---------------------# 8 | db.inventory.insert_many([ 9 | {'item': "canvas", 'qty': 100, 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}, 'status': "A"}, 10 | {'item': "journal", 'qty': 25, 'size': {'h': 14, 'w': 21, 'uom': "cm"}, 'status': "A"}, 11 | {'item': "mat", 'qty': 85, 'size': {'h': 27.9, 'w': 35.5, 'uom': "cm"}, 'status': "A"}, 12 | {'item': "mousepad", 'qty': 25, 'size': {'h':
19, 'w': 22.85, 'uom': "cm"}, 'status': "P"}, 13 | {'item': "notebook", 'qty': 50, 'size': {'h': 8.5, 'w': 11, 'uom': "in"}, 'status': "P"}, 14 | {'item': "paper", 'qty': 100, 'size': {'h': 8.5, 'w': 11, 'uom': "in"}, 'status': "D"}, 15 | {'item': "planner", 'qty': 75, 'size': {'h': 22.85, 'w': 30, 'uom': "cm"}, 'status': "D"}, 16 | {'item': "postcard", 'qty': 45, 'size': {'h': 10, 'w': 15.25, 'uom': "cm"}, 'status': "A"}, 17 | {'item': "sketchbook", 'qty': 80, 'size': {'h': 14, 'w': 21, 'uom': "cm"}, 'status': "A"}, 18 | {'item': "sketch pad", 'qty': 95, 'size': {'h': 22.85, 'w': 30.5, 'uom': "cm"}, 'status': "A"} 19 | ]) 20 | 21 | # db['inventory'].update_one({'item': 'paper'}, 22 | # {'$set': {'size.uom': 'cm', 'status': 'p'}, 23 | # '$currentDate': {'lastModified': True}}) 24 | # 25 | # db['inventory'].update_many({'qty': {'$lt': 50}}, 26 | # {'$set': {'size.uom': 'in', 'status': 'p'}, 27 | # '$currentDate': {'lastModified': True}}) 28 | 29 | 30 | # $addToSet:向数组中添加元素,若数组本身含有该元素,则不添加,否则,添加,这样就避免了数组中的元素重复现象; 31 | # $push:向数组尾部添加元素,但它不管数组中有没有该元素,都会添加 32 | db['inventory'].update({'item': 'canvas'}, 33 | {'$addToSet': {'comments': {'name': 456, 'status': 'p'}}, 34 | '$currentDate': {'lastModified': True}}, True) 35 | 36 | db['inventory'].update({'item': 'canvas'}, 37 | {'$addToSet': {'comments': {'name': 456, 'status': 'p'}}, 38 | '$currentDate': {'lastModified': True}}, True) 39 | 40 | db['inventory'].update({'item': 'canvas'}, 41 | {'$push': {'comments': {'name': 456, 'status': 'p'}}, 42 | '$currentDate': {'lastModified': True}}, True) 43 | # replace 44 | 45 | # db['inventory'].replace_one( 46 | # {'item': "paper"}, 47 | # {'item': "paper", 'instock': [{'warehouse': "A", 'qty': 60}, {'warehouse': "B", 'qty': 40}]} 48 | # ) 49 | # db['inventory'].update({'size.h': 30}, {'size': tmp}, True) 50 | client.close() 51 | -------------------------------------------------------------------------------- /pymongo/查找/1嵌入文档.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | post_id = db['inventory'].insert_many( 7 | [{'item': "canvas1", 'qty': 20, 'tags': ["blank", 'red', 'blue'], 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}}, 8 | {'item': "canvas2", 'qty': 100, 'tags': ["red", 'blank'], 'size': {'h': 8.5, 'w': 11.5, 'uom': "in"}}, 9 | {'item': "canvas3", 'qty': 60, 'tags': ["blank", 'red'], 'size': {'h': 8.5, 'w': 11.5, 'uom': "in"}}, 10 | {'item': "canvas4", 'qty': 45, 'tags': ["blank", 'red'], 'size': {'h': 30, 'w': 20.5, 'uom': "cm"}}, 11 | {'item': "canvas5", 'qty': 30, 'tags': ['blue'], 'size': {'h': 30, 'w': 20.5, 'uom': "cm"}} 12 | ] 13 | ) 14 | 15 | # --------------嵌入文档 ----------------------# 16 | print(list(db['inventory'].find({'size.h': 28}))) # 查询嵌入文档里面的字段,用. 
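# --- Added sketch (not in the original file): dot notation on an embedded
# --- document also combines with query operators, so you can match a range
# --- instead of one exact value. Reuses the `db` handle defined above.
print(list(db['inventory'].find({'size.h': {'$lt': 15}})))                      # h smaller than 15
print(list(db['inventory'].find({'size.uom': 'in', 'size.h': {'$gte': 8.5}})))  # two nested fields combined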
17 | # 字典里面的字段(field)必须完全匹配 order 顺序不能乱 18 | print(list(db['inventory'].find({'size': {'h': 28, 'w': 35.5, 'uom': "cm"}}))) 19 | print(list(db['inventory'].find({'size': {'w': 35.5, 'uom': "cm", 'h': 28}}))) 20 | print(list(db['inventory'].find({'size': {'w': 35.5, 'uom': "cm"}}))) 21 | 22 | client.close() 23 | -------------------------------------------------------------------------------- /pymongo/查找/2嵌入数组.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | post_id = db['inventory'].insert_many( 7 | [{'item': "canvas1", 'qty': 20, 'tags': ["blank", 'red', 'blue'], 'size': {'h': 28, 'w': 35.5, 'uom': "cm"}}, 8 | {'item': "canvas2", 'qty': 100, 'tags': ["red", 'blank'], 'size': {'h': 8.5, 'w': 11.5, 'uom': "in"}}, 9 | {'item': "canvas3", 'qty': 60, 'tags': ["blank", 'red'], 'size': {'h': 8.5, 'w': 11.5, 'uom': "in"}}, 10 | {'item': "canvas4", 'qty': 45, 'tags': ["blank", 'red'], 'size': {'h': 30, 'w': 20.5, 'uom': "cm"}}, 11 | {'item': "canvas5", 'qty': 30, 'tags': ['blue'], 'size': {'h': 30, 'w': 20.5, 'uom': "cm"}} 12 | ] 13 | ) 14 | 15 | # -------------嵌入数组------------------------# 16 | # print(list(db['inventory'].find({'tags': ['blue']}))) # 完全匹配 17 | # print(list(db['inventory'].find({'tags': ['red', 'blank']}))) 18 | 19 | # ---只要拥有red和blank即可,不必在意顺序---------# 20 | # all用在array上,不能在嵌入文档 21 | print(list(db['inventory'].find({'tags': {'$all': ['red', 'blank']}}))) 22 | 23 | # -----------tags字段至少包含一个blue-----------# 24 | # print(list(db['inventory'].find({'tags': 'blue'}))) 25 | 26 | 27 | # -----------tags字段的大小 -----------------# 28 | # print(list(db['inventory'].find({'tags': {'$size': 3}}))) 29 | 30 | # -------------. 
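# --- Added sketch (not in the original file): two more array queries against
# --- the same inventory collection. `$in` matches documents whose tags contain
# --- at least one of the listed values; `tags.0` matches on the first element,
# --- which is what the commented-out line below demonstrates on 'inventory2'.
print(list(db['inventory'].find({'tags': {'$in': ['red', 'blue']}})))  # tags contain 'red' or 'blue'
print(list(db['inventory'].find({'tags.0': 'blank'})))                 # first tag is 'blank'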
31 | # print(list(db['inventory2'].find({'tags.0': 'blue'}))) 32 | -------------------------------------------------------------------------------- /pymongo/查找/3数组中嵌入文档.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | # post_id = db['inventory'].insert_many( 7 | # [{'item': "canvas1", 'tags': [{'h': 28, 'uom': "in"}, {'h': 30, 'uom': "cm"}]}, 8 | # {'item': "canvas2", 'tags': [{'h': 15}]}, 9 | # {'item': "canvas3", 'tags': [{'h': 10, 'uom': "cm"}, {'h': 28, 'uom': "in"}]}, 10 | # {'item': "canvas4", 'tags': [{'h': 10, 'uom': "cm"}, {'h': 30, 'uom': "cm"}]}, 11 | # {'item': "canvas5", 'tags': [{'h': 28, 'uom': "cm"}, {'h': 30, 'uom': "cm"}]}, 12 | # {'item': "canvas6", 'tags': [{'h': 30, 'uom': "out"}, {'h': 28, 'uom': "cm"}]}, 13 | # ] 14 | # ) 15 | 16 | # -------------嵌入数组------------------------# 17 | # Equality matches on the whole embedded/nested document 18 | # require an exact match of the specified document, including the field orde 19 | # print(list(db['inventory'].find({'tags': {'h': 28, 'uom': 'in'}}))) 20 | # print(list(db['inventory'].find({'tags': {'uom': 'in', 'h': 28}}))) 21 | 22 | # 字段查询 23 | # print(list(db['inventory'].find({'tags.0.uom': 'in'}))) 24 | 25 | # elemMatch array文档至少一个满足 同一个元素中的键值组合 26 | # print(list(db['inventory'].find({'tags': {'$elemMatch': {'uom': 'in', 'h': 28}}}))) 27 | # print(list(db['inventory'].find({'tags': {'$elemMatch': {'h': 28}}}))) 28 | print(list(db['inventory'].find({'tags': {'$elemMatch': {'h': {'$gt': 10, '$lte': 20}}}}))) 29 | 30 | # 例如,以下查询匹配文档,其中嵌套在tags数组中的任何文档的h字段大于11, 31 | # 并且数组中的任何文档(但不一定是相同的嵌入文档)的h字段小于或等于20: 32 | print(list(db['inventory'].find({'tags.h': {'$gt': 11, '$lte': 20}}))) 33 | client.close() 34 | -------------------------------------------------------------------------------- /pymongo/查找/4从查询中返回的项目字段.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | db['inventory'].insert_many([ 7 | {'item': "journal", 'status': "A", 'size': {'h': 14, 'w': 21, 'uom': "cm"}, 8 | 'instock': [{'warehouse': "A", 'qty': 5}]}, 9 | {'item': "notebook", 'status': "A", 'size': {'h': 8.5, 'w': 11, 'uom': "in"}, 10 | 'instock': [{'warehouse': "C", 'qty': 5}]}, 11 | {'item': "paper", 'status': "D", 'size': {'h': 8.5, 'w': 11, 'uom': "in"}, 12 | 'instock': [{'warehouse': "A", 'qty': 60}]}, 13 | {'item': "planner", 'status': "D", 'size': {'h': 22.85, 'w': 30, 'uom': "cm"}, 14 | 'instock': [{'warehouse': "A", 'qty': 40}]}, 15 | {'item': "postcard", 'status': "A", 'size': {'h': 10, 'w': 15.25, 'uom': "cm"}, 16 | 'instock': [{'warehouse': "B", 'qty': 15}, {'warehouse': "C", 'qty': 35}]} 17 | ]) 18 | 19 | # 结果中只显示item, status 和 _id(默认) 20 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1}))) 21 | 22 | # Suppress _id Field 23 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, '_id': 0}))) 24 | 25 | # Return All But the Excluded Fields 26 | print(list(db['inventory'].find({'status': 'A'}, {'status': 0, 'instock': 0}))) 27 | 28 | # Return Specific Fields in Embedded Documents 29 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, "size.uom": 1}))) 30 | 31 | # Suppress Specific Fields in Embedded Documents 32 | 
print(list(db['inventory'].find({'status': 'A'}, {"size.uom": 0}))) 33 | 34 | # Projection on Embedded Documents in an Array 35 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, "instock.qty": 1}))) 36 | 37 | # Project Specific Array Elements in the Returned Array 38 | # The following example uses the $slice projection operator to return the last element in the instock array 39 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, "instock": {'$slice': 1}}))) 40 | 41 | 42 | # print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, "instock": {'$slice': [1, -1]}}))) 43 | print(list(db['inventory'].find({'status': 'A'}, {'item': 1, 'status': 1, "instock": 1}))) 44 | 45 | client.close() 46 | -------------------------------------------------------------------------------- /pymongo/查找/5空字段或缺失字段.py: -------------------------------------------------------------------------------- 1 | # Query for Null or Missing Fields 2 | 3 | from pymongo import MongoClient 4 | 5 | client = MongoClient(host='localhost', port=27017) 6 | db = client['text'] # 数据库名字 7 | 8 | # db['inventory'].insert_many([ 9 | # {'_id': 1, 'item': None}, 10 | # {'_id': 2} 11 | # ]) 12 | 13 | # The { item : null } query matches documents 14 | # that either contain the item field whose value is null or that do not contain the item field 15 | print(list(db['inventory'].find({'item': None}))) 16 | 17 | # only documents that contain the item field whose value is null 18 | print(list(db['inventory'].find({'item': {'$type': 10}}))) 19 | 20 | # The { item : { $exists: false } } query matches documents that do not contain the item field: 21 | print(list(db['inventory'].find({'item': {'$exists': True}}))) 22 | print(list(db['inventory'].find({'item': {'$exists': False}}))) 23 | 24 | client.close() -------------------------------------------------------------------------------- /pymongo/查找/6限制显示行数.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient(host='localhost', port=27017) 4 | db = client['text'] # 数据库名字 5 | 6 | 7 | cursor = db['inventory'].find({}).limit(2) 8 | for i in cursor: 9 | print(i) 10 | 11 | 12 | cursor = db['inventory'].find({}).limit(2).skip(2) 13 | for i in cursor: 14 | print(i) 15 | 16 | 17 | # cursor = db['inventory'].find({'$or': [{'qty': {'$gt': 95}}, {'qty': {'$lt': 30}}]}) 18 | # cursor = db['inventory'].find({'qty': {'$gt': 80}}) 19 | # for i in cursor: 20 | # print(i) -------------------------------------------------------------------------------- /singleton/README.md: -------------------------------------------------------------------------------- 1 | ## 单例模式 2 | - [ 元类](./metaclass.py) 3 | - [ 装饰器](./decorator.py) 4 | - [ __new__方法](./__new__.py) 5 | - [new 方法 线程安全](./new_threading_safe.py) -------------------------------------------------------------------------------- /singleton/__new__.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import time 3 | 4 | 5 | class Singleton(object): 6 | 7 | def __new__(cls, *args, **kwargs): 8 | if not hasattr(Singleton, "_instance"): 9 | Singleton._instance = super(Singleton, cls).__new__(cls) 10 | return Singleton._instance 11 | 12 | 13 | def task(arg): 14 | obj = Singleton(arg) 15 | print(obj) 16 | 17 | 18 | for i in range(20): 19 | t = threading.Thread(target=task, args=[i, ]) 20 | t.start() 21 | 
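# --- Added sketch (not part of the original __new__.py): the __new__-based
# --- singleton above returns one shared object, but __init__ still runs on
# --- every call, so per-instance state is overwritten; the lock-protected
# --- variant lives in new_threading_safe.py. `Config` is an illustrative name.
class Config(object):

    def __new__(cls, *args, **kwargs):
        if not hasattr(Config, "_instance"):
            Config._instance = super(Config, cls).__new__(cls)
        return Config._instance

    def __init__(self, value):
        self.value = value  # reassigned on every Config(...) call


a = Config(1)
b = Config(2)
print(a is b)   # True  -> the same shared object
print(a.value)  # 2     -> state overwritten by the second call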
-------------------------------------------------------------------------------- /singleton/decorator.py: -------------------------------------------------------------------------------- 1 | def Singleton(cls): 2 | _instance = {} 3 | 4 | def _singleton(*args, **kargs): 5 | if cls not in _instance: 6 | _instance[cls] = cls(*args, **kargs) 7 | return _instance[cls] 8 | return _singleton 9 | 10 | 11 | @Singleton 12 | class A(object): # A = singleton(A) 13 | a = 1 14 | 15 | def __init__(self, x=0): 16 | self.x = x 17 | 18 | 19 | obj1 = A(2) 20 | obj2 = A(3) 21 | print(obj1, obj2) 22 | -------------------------------------------------------------------------------- /singleton/metaclass.py: -------------------------------------------------------------------------------- 1 | class SingletonType(type): 2 | def __call__(cls, *args, **kwargs): 3 | if not hasattr(cls, "_instance"): 4 | cls._instance = super().__call__(*args, **kwargs) # 创建一个类对象 5 | return cls._instance 6 | 7 | 8 | class Foo(metaclass=SingletonType): 9 | def __init__(self, name): 10 | self.name = name 11 | 12 | obj1 = Foo('name1') 13 | obj2 = Foo('name2') 14 | obj3 = Foo('name3') 15 | print(obj1, obj2, obj3) 16 | print(obj1.name, obj2.name, obj3.name) 17 | 18 | -------------------------------------------------------------------------------- /singleton/new_threading_safe.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import time 4 | 5 | 6 | class Singleton(object): 7 | _instance_lock = threading.Lock() 8 | 9 | def __new__(cls, *args, **kwargs): 10 | with Singleton._instance_lock: 11 | if not hasattr(Singleton, "_instance"): 12 | Singleton._instance = super(Singleton, cls).__new__(cls) 13 | return Singleton._instance 14 | 15 | 16 | def task(arg): 17 | obj = Singleton(arg) 18 | print(obj) 19 | 20 | 21 | for i in range(10): 22 | t = threading.Thread(target=task, args=[i, ]) 23 | t.start() 24 | -------------------------------------------------------------------------------- /spiders/Bs4基本元素.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | 4 | 5 | url = 'http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/' 6 | re = requests.get(url) 7 | print(re.url) 8 | html = re.text 9 | soup = BeautifulSoup(html) 10 | print(soup.attrs) 11 | print(soup.prettify()) 12 | # print(soup.b.string) # 标签内字符串的注释部分 13 | # print(soup.p.string) # 标签内非字符串 14 | -------------------------------------------------------------------------------- /spiders/RE库基本使用.py: -------------------------------------------------------------------------------- 1 | import re 2 | # group(0)永远是原始字符串,group(1)、group(2)……表示第1、2、……个子串。 3 | # re.search(pattern= , string= , flags=0) 4 | # 从一个字符串中搜索匹配正则表达式的第一个位置 5 | # pattern 正则表达式的字符串或原生字符串表示 6 | # string 待匹配字符串 7 | # flags 正则表达式使用时的控制标记 8 | ''' 9 | re.I 忽略正则表达式的大小写 10 | re.M 给定字符串的每行当中匹配开始 11 | re.S 默认匹配除换行外的所以匹配 12 | ''' 13 | 14 | # match = re.search(r'[1-9]\d{5}', 'BIT 100081 TSU 100084',) 15 | # print(match.re) 16 | # print(match.pos) 17 | # print(match.endpos) 18 | # print(match.string) 19 | # print('xxxxxxxxx') 20 | # print(match.group(0)) 21 | # print(match.start()) 22 | # print(match.end()) 23 | # print(match.span()) 24 | # 25 | # print(match) 26 | # pat = re.compile(r'') 27 | # rst = pat.search('BIT 100081 TSU 100084') 28 | # print(rst) 29 | # # 函数式用法,一次性操作 30 | # ''' 31 | # 面向对象用法:编译后的多次操作 32 | # pat = re.compile(r'[1-9]\d{5}') #将正则表达式的字符串形式编译成正则表达式对象 33 | # rst 
= pat.search('BIT 100081') 34 | # ''' 35 | # if match: 36 | # print(match.group(0)) 37 | # 38 | # ''' 39 | # re.match(pattern, string, flags=0) 40 | # 从一个字符串的开始位置起匹配正则表达式,返回match对象 41 | # ''' 42 | # # match = re.match(r'[1-9]\d{5}', 'BIT 100081') 43 | # match = re.match(r'[1-9]\d{5}', '100081 BIT') 44 | # if match: 45 | # print(match.group(0)) 46 | # 47 | # 48 | # 49 | # ''' 50 | # re.findall() 51 | # 搜素字符串,以列表类型返回全部匹配的字串 52 | # ''' 53 | # 54 | # ls = re.findall(r'[1-9]\d{5}', 'BIT100081 TSU100084') 55 | # ls = re.findall(r'(\+86[1][23456789]\d{9}|' 56 | # r'86[1][23456789]\d{9})', '8613125134887 +8611125134887 +8613125134887') 57 | # print(ls) 58 | # 59 | # ''' 60 | # re.split(pattern, string, maxsplit=0, flag=0) 61 | # 将一个字符串按照正则表达式匹配结果进行分割,返回列表类型 62 | # maxsplit: 正则表达式使用时的控制标记 63 | # ''' 64 | # ls = re.split(r'[1-9]\d{5}', 'BIT100081 TSU100084') 65 | # print(ls) 66 | # ls = re.split(r'[1-9]\d{5}', 'BIT100081 TSU100084', maxsplit=1) 67 | # print(ls) 68 | # 69 | # ''' 70 | # re.findite() 71 | # 搜素字符串,返回一个匹配结果的迭代类型,每个迭代元素为match对象 72 | # ''' 73 | # for m in re.finditer(r'[1-9]\d{5}', 'BIT100081 TSU100084'): 74 | # print(m.group(0)) 75 | # 76 | # ''' 77 | # re.sub(pattern, repl, string, count=0, flags=0) 78 | # 在一个字符串中替换所有匹配正则表达式的字串,返回替换后的字符串 79 | # repl: 替换匹配字符串的字符串 80 | # count:匹配的最大替换次数 81 | # ''' 82 | # # 83 | # ls = re.sub(r'[1-9]\d{5}', ':zipcode', 'BIT100081 TSU100084') 84 | # print(ls) 85 | -------------------------------------------------------------------------------- /spiders/ajax今日头条.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from urllib.parse import urlencode 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from requests.exceptions import ConnectionError 7 | import re 8 | from multiprocessing import Pool 9 | from hashlib import md5 10 | from json.decoder import JSONDecodeError 11 | 12 | 13 | def get_page_index(offset, keyword): 14 | data = { 15 | 'autoload': 'true', 16 | 'count': 20, 17 | 'cur_tab': 1, 18 | 'format': 'json', 19 | 'keyword': keyword, 20 | 'offset': offset, 21 | } 22 | params = urlencode(data) 23 | base = 'http://www.toutiao.com/search_content/' 24 | url = base + '?' 
+ params 25 | print(url) 26 | try: 27 | response = requests.get(url) 28 | if response.status_code == 200: 29 | return response.text 30 | return None 31 | except ConnectionError: 32 | print('Error occurred') 33 | return None 34 | 35 | 36 | def download_image(url): 37 | print('Downloading', url) 38 | try: 39 | response = requests.get(url) 40 | if response.status_code == 200: 41 | save_image(response.content) 42 | return None 43 | except ConnectionError: 44 | return None 45 | 46 | 47 | def save_image(content): 48 | file_path = '{0}'.format(os.getcwd() + '\今日头条照片') 49 | if not os.path.exists(file_path): 50 | os.makedirs(file_path) 51 | image_path = '{0}/{1}.{2}'.format(os.getcwd() + '\今日头条照片', md5(content).hexdigest(), 'jpg') 52 | if not os.path.exists(image_path): 53 | with open(image_path, 'wb') as f: 54 | f.write(content) 55 | f.close() 56 | 57 | 58 | def parse_page_index(text): 59 | try: 60 | data = json.loads(text) 61 | if data and 'data' in data.keys(): 62 | for item in data.get('data'): 63 | yield item.get('article_url') 64 | except JSONDecodeError: 65 | pass 66 | 67 | 68 | def get_page_detail(url): 69 | headers = { 70 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} 71 | try: 72 | response = requests.get(url, headers=headers) 73 | if response.status_code == 200: 74 | return response.text 75 | return None 76 | except ConnectionError: 77 | print('Error occurred') 78 | return None 79 | 80 | 81 | def parse_page_detail(html, url): 82 | soup = BeautifulSoup(html, 'lxml') 83 | result = soup.select('title') 84 | title = result[0].get_text() if result else '' 85 | images_pattern = re.compile('gallery: JSON.parse\("(.*)"\)', re.S) 86 | result = re.search(images_pattern, html) 87 | if result: 88 | data = json.loads(result.group(1).replace('\\', '')) 89 | if data and 'sub_images' in data.keys(): 90 | sub_images = data.get('sub_images') 91 | images = [item.get('url') for item in sub_images] 92 | for image in images: 93 | download_image(image) 94 | return { 95 | 'title': title, 96 | 'url': url, 97 | 'images': images 98 | } 99 | 100 | 101 | # def save_to_mongo(result): 102 | # if db[MONGO_TABLE].insert(result): 103 | # print('Successfully Saved to Mongo', result) 104 | # return True 105 | # return False 106 | 107 | 108 | def main(offset): 109 | KEYWORD = '街拍' 110 | text = get_page_index(offset, KEYWORD) 111 | urls = parse_page_index(text) 112 | for url in urls: 113 | if url != None: 114 | html = get_page_detail(url) 115 | result = parse_page_detail(html, url) 116 | # if result : save_to_mongo(result) 117 | 118 | 119 | if __name__ == '__main__': 120 | # GROUP_START = 1 121 | # GROUP_END = 3 122 | main(0) 123 | # pool = Pool() 124 | # groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)]) 125 | # pool.map(main, groups) 126 | # pool.close() 127 | # pool.join() -------------------------------------------------------------------------------- /spiders/csdn_ajax.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | def get_page(): 5 | headers = {'cookie':'', 6 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} 7 | url = 'https://www.csdn.net/api/articles?type=more&category=home&shown_offset=0' 8 | try: 9 | r = requests.get(url, headers=headers) 10 | if r.status_code == 200: 11 | html = r.json() 12 | articles = html['articles'] 13 | if len(articles) == 0: 14 | print(url) 15 | return 
r.json() 16 | return None 17 | except ConnectionError: 18 | return None 19 | 20 | 21 | def pares_page(html): 22 | articles = html['articles'] 23 | print(len(articles)) 24 | for article in articles: 25 | yield article['title'] 26 | 27 | 28 | def main(): 29 | for i in range(20): 30 | html = get_page() 31 | yield from pares_page(html) 32 | 33 | if __name__ == '__main__': 34 | l = list(main()) 35 | print(len(l)) -------------------------------------------------------------------------------- /spiders/jdsearch.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import time 5 | 6 | 7 | def get_frist(page, s): 8 | params = { 9 | 'keyword': '小米手机', 10 | 'enc': 'utf-8', 11 | 'qrst': '1', 12 | 'rt': '1', 13 | 'stop': '1', 14 | 'vt': '2', 15 | 'bs': '1', 16 | 'psort':'3', 17 | 'ev': 'exbrand_小米(MI)^', 18 | 'page': str(page), 19 | 's': s, 20 | 'click': '0', 21 | } 22 | url = 'https://search.jd.com/Search?' 23 | try: 24 | r = requests.get(url, params=params) 25 | r.raise_for_status() 26 | r.encoding = 'utf-8' 27 | print(r.url) 28 | return r.text 29 | except Exception as e: 30 | print(r.status_code) 31 | 32 | 33 | def get_last(page, s): 34 | log_id = time.time() 35 | log_id = '%.5f' % log_id 36 | params = { 37 | 'keyword': '小米手机', 38 | 'enc': 'utf-8', 39 | 'qrst': '1', 40 | 'rt': '1', 41 | 'stop': '1', 42 | 'vt': '2', 43 | 'bs': '1', 44 | 'psort': '3', 45 | 'ev': 'exbrand_小米(MI)^', 46 | 'page': str(page), 47 | 's': s, 48 | 'scrolling': 'y', 49 | 'log_id': log_id, 50 | 'tpl': '3_M', 51 | } 52 | url = 'https://search.jd.com/Search?' 53 | try: 54 | r = requests.get(url, params=params) 55 | r.raise_for_status() 56 | r.encoding = 'utf-8' 57 | print(r.url) 58 | return r.text 59 | except Exception as e: 60 | print(r.status_code) 61 | 62 | 63 | def get_info(text, count): 64 | soup = BeautifulSoup(text, 'html.parser') 65 | for child in soup.find_all(class_='gl-item'): 66 | data = {} 67 | try: 68 | data['price'] = child.find('strong').attrs['data-price'] 69 | except: 70 | data['price'] = child.find('strong').find('i').text 71 | 72 | try: 73 | data['shop'] = child.find(class_='p-shop').find('a').text.strip() 74 | except: 75 | print('这是一个广告') 76 | continue 77 | 78 | try: 79 | data['name'] = child.find(class_='p-name').find('em').text 80 | data['commit'] = child.find(class_='p-commit').find('strong').find('a').text 81 | except: 82 | continue 83 | print(data) 84 | save_to_file(data) 85 | count += 1 86 | if count == 200: 87 | break 88 | return count 89 | 90 | 91 | def save_to_file(data): 92 | file = 'goods.txt' 93 | with open(file, 'a', encoding='utf-8') as f: 94 | f.write(str(data)) 95 | f.write('\n') 96 | 97 | 98 | if __name__ == '__main__': 99 | # text = get_frist() 100 | s = 1 101 | count = 0 102 | for i in range(0, 10, 2): 103 | text = get_frist(i, s) 104 | s = s + 30 105 | count = get_info(text, count) 106 | if count == 200: 107 | print('200个信息爬取完毕') 108 | break 109 | 110 | text = get_last(i+1, s) 111 | s = s + 30 112 | count = get_info(text, count) 113 | if count == 200: 114 | print('200个信息爬取完毕') 115 | break 116 | -------------------------------------------------------------------------------- /spiders/jianshu.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from urllib.parse import urlencode 4 | import pymongo 5 | client = pymongo.MongoClient('localhost') 6 | db = client['jianshu'] 7 | data = [] 8 | 9 | 10 | def 
get_first_page(url): 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 13 | } 14 | try: 15 | r = requests.get(url, headers=headers) 16 | 17 | if r.status_code == 200: 18 | return r.text 19 | return None 20 | except ConnectionError: 21 | print('抓取失败', url) 22 | return None 23 | 24 | 25 | def get_page(url): 26 | headers = { 27 | 'X-CSRF-Token': '6vJnbFxpgkYWu28t+TQd77DYYeG/HuELzV4vKveTleCyCWtAFd408Un7Z5cwn3b1hzZB3uGqzUQprnJKOL3lgw==', 28 | 'X-PJAX': 'true', 29 | 'X-Requested-With': 'XMLHttpRequest', 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 31 | } 32 | try: 33 | r = requests.get(url, headers=headers) 34 | if r.status_code == 200: 35 | return r.text 36 | return None 37 | except ConnectionError: 38 | print('抓取失败', url) 39 | return None 40 | 41 | 42 | def save_to_mongo(result): 43 | db['result'].insert(result) 44 | 45 | 46 | def parse_first_page(html): 47 | global data 48 | soup = BeautifulSoup(html, 'lxml') 49 | note_list = soup.find('ul', class_='note-list') 50 | if note_list is None: 51 | return None 52 | for li in note_list.find_all('li'): 53 | try: 54 | id = 'seen_snote_ids%5B%5D=' + li.get('data-note-id') 55 | data.append(id) 56 | yield { 57 | 'title': li.find('div').find('a').text, 58 | 'abstract': li.find('p').text, 59 | 'nickname': li.find(class_='meta').find(class_='nickname').text 60 | } 61 | except: 62 | continue 63 | 64 | 65 | def parse_page(html): 66 | global data 67 | soup = BeautifulSoup(html, 'lxml') 68 | for li in soup.find_all('li'): 69 | try: 70 | id = 'seen_snote_ids%5B%5D=' + li.get('data-note-id') 71 | data.append(id) 72 | yield { 73 | 'title': li.find('div').find('a').text, 74 | 'abstract': li.find('p').text, 75 | 'nickname': li.find(class_='meta').find(class_='nickname').text 76 | } 77 | except: 78 | continue 79 | 80 | 81 | def main(): 82 | global data 83 | # 第一次请求 84 | print('正在解析第一页') 85 | url = 'https://www.jianshu.com/?&page=1' 86 | html = get_first_page(url) 87 | if html is None: 88 | return False 89 | for result in parse_first_page(html): 90 | save_to_mongo(result) 91 | # 弟二三请求都是get请求 92 | # 后面是post请求 93 | print('解析分页数据') 94 | for i in range(2, 16): 95 | params = '&'.join(data) 96 | url = 'https://www.jianshu.com/?' 
+ params + '&page={}'.format(i) 97 | html = get_page(url) 98 | for result in parse_page(html): 99 | save_to_mongo(result) 100 | 101 | 102 | if __name__ == '__main__': 103 | main() -------------------------------------------------------------------------------- /spiders/newhouse.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import csv 5 | 6 | def get_html_text(page): 7 | url = f'http://newhouse.nj.house365.com/house/dist-4_p-{page}/' 8 | print('正在解析:', url) 9 | try: 10 | r = requests.get(url) 11 | r.raise_for_status() 12 | return r.text 13 | except Exception as e: 14 | print('抓取失败', r.status_code) 15 | return None 16 | 17 | 18 | def get_page(html): 19 | soup = BeautifulSoup(html, 'html.parser') 20 | page_num = soup.find(class_='fr orderby').find('b').text 21 | return int(page_num) 22 | 23 | 24 | def get_info(html): 25 | soup = BeautifulSoup(html, 'html.parser') 26 | for mc in soup.find_all(class_='mc_list'): 27 | data = {} 28 | data['name'] = mc.find(class_='tit').find('a').text 29 | data['addr'] = mc.find(class_='yh_info').find_all('p')[1].text.strip().split()[0] 30 | data['price'] = mc.find(class_='xiang_price').text.strip().split()[0] 31 | try: 32 | data['phone'] = ''.join(mc.find(class_='pt5').find('b').text.split()) 33 | except Exception as e: 34 | print('售空,没给电话号码') 35 | data['phone'] = 'Null' 36 | print(data) 37 | save_to_file(data) 38 | 39 | def save_to_file(data): 40 | # with open('houseInfo.txt', 'a', encoding='utf-8') as f: 41 | # f.write(str(data) + '\n') 42 | with open('houseInfo.csv', 'a', encoding='utf-8', newline ='') as f: 43 | writer = csv.DictWriter(f, ['name', 'phone', 'addr', 'price']) 44 | writer.writerow(data) 45 | 46 | 47 | if __name__ == '__main__': 48 | html = get_html_text(1) 49 | page_num = int(get_page(html) / 15) + 2 50 | get_info(html) 51 | for i in range(2, page_num): 52 | html = get_html_text(str(i)) 53 | get_info(html) 54 | time.sleep(3) 55 | -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStockInfo.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStockInfo.txt -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/pipelines.cpython-36.pyc 
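# --- Added sketch (not in the repository): newhouse.py above appends rows with
# --- csv.DictWriter but never writes a header, so houseInfo.csv ends up without
# --- column names. A minimal fix is to emit the header once when the file is created:
import csv
import os

FIELDS = ['name', 'phone', 'addr', 'price']


def save_to_file(data, path='houseInfo.csv'):
    write_header = not os.path.exists(path)
    with open(path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, FIELDS)
        if write_header:
            writer.writeheader()  # column names written a single time
        writer.writerow(data)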
-------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaidustocksItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BaidustocksSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class BaidustocksDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 
67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 |
-------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/pipelines.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | 10 | class BaidustocksInfoPipeline(object): 11 | def open_spider(self, spider): 12 | # open the output file once when the spider starts; without this method 13 | # self.f is never defined and process_item silently drops every item 14 | self.f = open('BaiduStockInfo.txt', 'w')  # matches BaiduStockInfo.txt in the project root 15 | 16 | def close_spider(self, spider): 17 | self.f.close() 18 | 19 | def process_item(self, item, spider): 20 | try: 21 | line = str(dict(item)) + '\n' 22 | self.f.write(line) 23 | except: 24 | pass 25 | return item
-------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/settings.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for BaiduStocks project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'BaiduStocks' 13 | 14 | SPIDER_MODULES = ['BaiduStocks.spiders'] 15 | NEWSPIDER_MODULE = 'BaiduStocks.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'BaiduStocks (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'BaiduStocks.middlewares.BaidustocksSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'BaiduStocks.middlewares.BaidustocksDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/spiders/__pycache__/stocks.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/BaiduStocks/BaiduStocks/spiders/__pycache__/stocks.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/spiders/stocks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | 5 | 6 | class StocksSpider(scrapy.Spider): 7 | name = 'stocks' 8 | allowed_domains = ['baidu.com'] 9 | start_urls = ['http://quote.eastmoney.com/stocklist.html'] 10 | 11 | 12 | def parse(self, response): 13 | # temp = response.css('.a') 14 | # print(temp.css('::text').extract(), '***************') 15 | # print(response.css('a::attr(href)').extract(), '***************') 16 | # print('***************************************************') 17 | for href in response.css('a::attr(href)').extract(): 18 | # for href in temp.css('::attr(href)').extract(): 19 | # 找到class为a的所有结点。提取a标签属性为href的内容 20 | # .extract()为了提取真实的原文数据 返回的系统自带的List 没有这个是SelectorList 21 | # print(href) 22 | try: 23 | # re.findall(r'[s][hz]\d{6}', href)[0] 24 | stock = re.findall(r'[s][hz]\d{6}', href)[0] #以列表形式返回能匹配的字符串 25 | url = 'https://gupiao.baidu.com/stock/' + stock + '.html' 26 | # print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%') 27 | # print(url) 28 | yield scrapy.Request(url, callback=self.parse_stock) 29 | # Request类 由scrapy生产。由downloader执行 30 | # 表示一个HTTP请求。.method对应请求的方法 31 | # classback 回调函数 将此请求返回的response传递给下一个函数进行处理 32 | except: 33 | continue 34 | 35 | def parse_stock(self, response): 36 | info_dict = {} 37 | temp = response 38 | stock_info = response.css('.stock-bets') 39 | name = stock_info.css('.bets-name').extract()[0] 40 | key_list = stock_info.css('dt').extract() 41 | value_list = stock_info.css('dd').extract() 42 | for i in range(len(key_list)): 43 | key = re.findall(r'>.*', key_list[i])[0][1:-5] 44 | try: 45 | val = re.findall(r'\d+.?.*.', value_list[i])[0][0:-5] 46 | except: 47 | val = '--' 48 | info_dict[key] = val 49 | info_dict.update( 50 | {'股票名称': re.findall('\s.*\(', name)[0].split()[0] + re.findall('\>.*\<', name)[0][1: -1]}) 51 | yield info_dict 52 | -------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/BaiduStocks/调试.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from scrapy import cmdline 4 | name = 'stocks' 5 | cmd = 'scrapy crawl {0}'.format(name) 6 | cmdline.execute(cmd.split()) 
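# --- Added sketch (not part of the original 调试.py): an alternative debug
# --- runner that starts the 'stocks' spider in-process with CrawlerProcess
# --- instead of re-invoking the scrapy command line; it assumes it is run from
# --- the BaiduStocks project directory so get_project_settings() finds settings.py.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('stocks')  # spider name defined in spiders/stocks.py
process.start()          # blocks until the crawl finishes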
-------------------------------------------------------------------------------- /spiders/scrapy/BaiduStocks/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = BaiduStocks.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = BaiduStocks 12 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TencentItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | position_name = scrapy.Field() 15 | position_link = scrapy.Field() 16 | position_type = scrapy.Field() 17 | position_number = scrapy.Field() 18 | work_location = scrapy.Field() 19 | publish_time = scrapy.Field() 20 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TencentSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class TencentDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TencentPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Tencent project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Tencent' 13 | 14 | SPIDER_MODULES = ['Tencent.spiders'] 15 | NEWSPIDER_MODULE = 'Tencent.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Tencent (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'Tencent.middlewares.TencentSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'Tencent.middlewares.TencentDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'Tencent.pipelines.TencentPipeline': 300, 69 | #} 
70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/spiders/__pycache__/tencent.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/Tencent/Tencent/spiders/__pycache__/tencent.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/Tencent/spiders/tencent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, Request 3 | 4 | from Tencent.items import TencentItem 5 | 6 | 7 | class TencentSpider(Spider): 8 | name = 'tencent' 9 | allowed_domains = ['tencent.com'] 10 | url = 'https://hr.tencent.com/position.php?&start={index}#a' 11 | 12 | def start_requests(self): 13 | start_url = self.url.format(index=0) 14 | yield Request(start_url, callback=self.parse) 15 | 16 | def parse(self, response): 17 | table_list = response.xpath('//*[@id="position"]/div[1]/table/tr') 18 | for tr in table_list[1:-1]: 19 | item = TencentItem() 20 | item['position_name'] = tr.xpath('./td/a/text()').extract_first() 21 | item['position_link'] = tr.xpath('./td/a/@href').extract_first() 22 | item['position_type'] = tr.xpath('./td[2]/text()').extract_first() 23 | item['position_number'] = tr.xpath('./td[3]/text()').extract_first() 24 | item['work_location'] = tr.xpath('./td[4]/text()').extract_first() 25 | item['publish_time'] = tr.xpath('./td[5]/text()').extract_first() 26 | yield item 27 | for i in range(2, 331): 28 | url = self.url.format(index=i*10) 29 | yield Request(url=url, callback=self.parse) 30 
| -------------------------------------------------------------------------------- /spiders/scrapy/Tencent/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Tencent.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Tencent 12 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/jianshu/jishuspider/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JishuItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | author = scrapy.Field() 16 | publish_time = scrapy.Field() 17 | wordage = scrapy.Field() 18 | views_count = scrapy.Field() 19 | comments_count = scrapy.Field() 20 | likes_count = scrapy.Field() 21 | rewards_count = scrapy.Field() 22 | pass 23 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class JishuspiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class JishuspiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | 10 | class JishuspiderPipeline(object): 11 | def process_item(self, item, spider): 12 | return item 13 | 14 | 15 | class MongoPipeline(object): 16 | def __init__(self, mongo_uri, mongo_db): 17 | self.mongo_uri = mongo_uri 18 | self.mongo_db = mongo_db 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls( 23 | mongo_db=crawler.settings.get('MONGO_DB'), 24 | mongo_uri=crawler.settings.get('MONGO_URI') # keyword must match the mongo_uri parameter of __init__ 25 | ) 26 | 27 | def open_spider(self, spider): 28 | self.client = pymongo.MongoClient(self.mongo_uri) 29 | self.db = self.client[self.mongo_db] 30 | 31 | def process_item(self, item, spider): 32 | name = item.__class__.__name__ 33 | self.db[name].insert_one(dict(item)) 34 | return item 35 | 36 | def close_spider(self, spider): 37 | self.client.close() 38 | 39 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jishuspider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jishuspider' 13 | 14 | SPIDER_MODULES = ['jishuspider.spiders'] 15 | NEWSPIDER_MODULE = 'jishuspider.spiders' 16 | 17 | MONGO_URI = 'localhost' 18 | MONGO_DB = 'jianshu' 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'jishuspider (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | DEFAULT_REQUEST_HEADERS = { 44 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | 'Accept-Language': 'en', 46 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 47 | } 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'jishuspider.middlewares.JishuspiderSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'jishuspider.middlewares.JishuspiderDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 69 | # ITEM_PIPELINES = { 70 | # 'jishuspider.pipelines.MongoPipeline': 300, 71 | # } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 
-------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/jishuspider/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from jishuspider.items import JishuItem 4 | import re 5 | import json 6 | 7 | 8 | class ZhihuSpider(scrapy.Spider): 9 | name = 'jishu' 10 | 11 | def start_requests(self): 12 | self.url = 'https://www.jianshu.com/trending/weekly?&page=1' 13 | self.headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 15 | } 16 | yield scrapy.Request(self.url, headers=self.headers, callback=self.parse) 17 | 18 | def parse(self, response): 19 | lis = response.css('#list-container > ul li') 20 | for li in lis: 21 | href = li.css('.title::attr(href)').extract_first() 22 | url = response.urljoin(href) 23 | meta = {'Referer': url} 24 | yield scrapy.Request(url, meta=meta, headers=self.headers, callback=self.parse_page) 25 | 26 | def parse_page(self, response): 27 | page_data = re.search(r'page-data">(.*?)<', response.text, re.S).group(1) 28 | note = json.loads(page_data)['note'] 29 | note_id = note['id'] 30 | meta = { 31 | 'author': note['author'], # author info 32 | 'title': response.css('body > div.note > div.post > div.article > .title::text').extract_first(), # title 33 | 'wordage': note['public_wordage'], # total word count 34 | 'views_count': note['views_count'], # number of views 35 | 'comments_count': note['comments_count'], # number of comments 36 | 'likes_count': note['likes_count'], # number of likes 37 | 'rewards_count': note['total_rewards_count'] # number of rewards 38 | } 39 | comment_headers = { 40 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 41 | 'Referer': response.meta['Referer'], 42 | 'Accept': 'application/json', 43 | 'Accept-Encoding': 'gzip, deflate, sdch, br', 44 | 'Accept-Language': 'zh-CN,zh;q=0.8', 45 | 'Connection': 'keep-alive' 46 | } 47 | comments_url = 'https://www.jianshu.com/notes/%s/comments?comment_id=&author_only=false&since_id=0&max_id=1586510606000&order_by=desc&page=1' % str(note_id) 48 | yield scrapy.Request(url=comments_url, headers=comment_headers, meta=meta, callback=self.parse_comments) 49 | 50 | def parse_comments(self, response): 51 | # Fill the item with the article details carried over in the request meta and yield it. 52 | item = JishuItem() 53 | for field in ('author', 'title', 'wordage', 'views_count', 'comments_count', 'likes_count', 'rewards_count'): 54 | item[field] = response.meta.get(field) 55 | page = json.loads(response.text) 56 | comments = page['comments'] 57 | comment_infos = [] # commenter info and comment content for every comment 58 | for comment in comments: 59 | user = comment['user'] 60 | comment_infos.append({ 61 | 'nick_name': user['nickname'], # commenter's name 62 | 'compiled_comment': comment['compiled_content'] # comment content 63 | }) 64 | # comment['children'] holds the reply comments; they are not handled yet 65 | yield item 66 | 67 | -------------------------------------------------------------------------------- /spiders/scrapy/jianshu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more
information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jishuspider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jishuspider 12 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/movie/movie/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/movie/movie/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/movie/movie/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MovieItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MovieSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class MovieDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class MoviePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for movie project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'movie' 13 | 14 | SPIDER_MODULES = ['movie.spiders'] 15 | NEWSPIDER_MODULE = 'movie.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'movie (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'movie.middlewares.MovieSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'movie.middlewares.MovieDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'movie.pipelines.MoviePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy 
project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/movie/movie/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/spiders/__pycache__/movies.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/movie/movie/spiders/__pycache__/movies.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/movie/movie/spiders/movies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class MoviesSpider(scrapy.Spider): 6 | name = 'movies' 7 | allowed_domains = ['phthon.com'] 8 | start_urls = ['http://phthon.com/'] 9 | 10 | def parse(self, response): 11 | movies = response.x 12 | -------------------------------------------------------------------------------- /spiders/scrapy/movie/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = movie.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = movie 12 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/python123demo/python123demo/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/python123demo/python123demo/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/python123demo/python123demo/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class Python123DemoItem(scrapy.Item): 12 | # define the fields for your item here like: 13 
| # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class Python123DemoSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class Python123DemoDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class Python123DemoPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for python123demo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'python123demo' 13 | 14 | SPIDER_MODULES = ['python123demo.spiders'] 15 | NEWSPIDER_MODULE = 'python123demo.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'python123demo (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'python123demo.middlewares.Python123DemoSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'python123demo.middlewares.Python123DemoDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 
67 | #ITEM_PIPELINES = { 68 | # 'python123demo.pipelines.Python123DemoPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/python123demo/python123demo/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/spiders/__pycache__/demo.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/python123demo/python123demo/spiders/__pycache__/demo.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/python123demo/spiders/demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import os 4 | 5 | 6 | class DemoSpider(scrapy.Spider): 7 | name = "demo" 8 | allowed_domains = ['python123.io'] 9 | start_urls = ["http://python123.io/ws/demo.html"] 10 | 11 | def parse(self, response): 12 | """ 13 | Handle the response: parse the content into a dict and discover new URLs to crawl. 14 | :param response: 15 | """ 16 | fname = response.url.split('/')[-1] 17 | with open(fname, 'wb') as f: 18 | f.write(response.body) 19 | self.log('Saved file %s.'
% fname) 20 | -------------------------------------------------------------------------------- /spiders/scrapy/python123demo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = python123demo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = python123demo 12 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class QuotesItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | text = scrapy.Field() 15 | author = scrapy.Field() 16 | tags = scrapy.Field() 17 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: 
https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from scrapy.exceptions import DropItem 9 | 10 | 11 | class TextPipeline(object): 12 | 13 | def __init__(self): 14 | self.limit = 50 15 | 16 | def process_item(self, item, spider): 17 | if item['text']: 18 | if len(item['text']) > self.limit: 19 | item['text'] = item['text'][0:self.limit].rstrip() + '...' 20 | return item 21 | else: 22 | raise DropItem('Missing Text') 23 | 24 | 25 | class MongoPipeline(object): 26 | 27 | def __init__(self, mongo_url, mongo_db): 28 | self.mongo_url = mongo_url 29 | self.mongo_db = mongo_db 30 | 31 | @classmethod 32 | def from_crawler(cls, crawler): 33 | return cls( 34 | mongo_url=crawler.settings.get('MONGO_URL'), 35 | mongo_db=crawler.settings.get('MONGO_DB') 36 | ) 37 | 38 | # Connect to the database as soon as a spider is opened 39 | def open_spider(self, spider): 40 | self.client = pymongo.MongoClient(self.mongo_url) 41 | self.db = self.client[self.mongo_db] 42 | 43 | def process_item(self, item, spider): 44 | name = item.__class__.__name__ 45 | self.db['quotes'].insert_one(dict(item)) 46 | return item 47 | 48 | def close_spider(self, spider): 49 | self.client.close() 50 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for quoteturorial project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'quoteturorial' 13 | 14 | SPIDER_MODULES = ['quoteturorial.spiders'] 15 | NEWSPIDER_MODULE = 'quoteturorial.spiders' 16 | 17 | MONGO_URL = 'localhost' 18 | MONGO_DB = 'quotestutorial' 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | #USER_AGENT = 'quoteturorial (+http://www.yourdomain.com)' 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = True 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | #DOWNLOAD_DELAY = 3 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'quoteturorial.middlewares.QuoteturorialSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | #
'quoteturorial.middlewares.QuoteturorialDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'quoteturorial.pipelines.TextPipeline': 300, 71 | 'quoteturorial.pipelines.MongoPipeline': 400, 72 | } 73 | 74 | # Enable and configure the AutoThrottle extension (disabled by default) 75 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 76 | #AUTOTHROTTLE_ENABLED = True 77 | # The initial download delay 78 | #AUTOTHROTTLE_START_DELAY = 5 79 | # The maximum download delay to be set in case of high latencies 80 | #AUTOTHROTTLE_MAX_DELAY = 60 81 | # The average number of requests Scrapy should be sending in parallel to 82 | # each remote server 83 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 84 | # Enable showing throttling stats for every response received: 85 | #AUTOTHROTTLE_DEBUG = False 86 | 87 | # Enable and configure HTTP caching (disabled by default) 88 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 89 | #HTTPCACHE_ENABLED = True 90 | #HTTPCACHE_EXPIRATION_SECS = 0 91 | #HTTPCACHE_DIR = 'httpcache' 92 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 93 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 94 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/spiders/__pycache__/quotes.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/quoteturorial/quoteturorial/spiders/__pycache__/quotes.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/spiders/quotes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from quoteturorial.items import QuotesItem 4 | 5 | class QuotesSpider(scrapy.Spider): 6 | name = 'quotes' 7 | allowed_domains = ['quotes.toscrape.com'] 8 | start_urls = ['http://quotes.toscrape.com/'] 9 | 10 | def parse(self, response): 11 | item = QuotesItem() 12 | quotes = response.css('.quote') 13 | for quote in quotes: 14 | text = quote.css('.text::text').extract_first() 15 | author = quote.css('.author::text').extract_first() 16 | tags = quote.css('.tags .tag::text').extract() 17 | item['text'] = text 18 | item['author'] = author 19 | item['tags'] = tags 20 | yield item 21 | 22 | next = response.css('.pager .next a::attr(href)').extract_first() 23 | url = response.urljoin(next) 24 | yield scrapy.Request(url=url, callback=self.parse) 25 | 26 | 27 | -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/quoteturorial/调试.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from scrapy import cmdline 4 | name = 'quotes' 5 | cmd = 'scrapy crawl {0}'.format(name) 6 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /spiders/scrapy/quoteturorial/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = quoteturorial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = quoteturorial 12 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = weibo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = weibo 12 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/weibo/weibo/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/weibo/weibo/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/weibo/weibo/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WeiboItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | 9 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class WeiboPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/weibo/weibo/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/spiders/__pycache__/weibos.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/weibo/weibo/spiders/__pycache__/weibos.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/weibo/weibo/spiders/weibos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, FormRequest 3 | 4 | 5 | class WeibosSpider(Spider): 6 | name = 'weibos' 7 | allowed_domains = ['weibo.cn'] 8 | start_urls = 'http://weibo.cn/search/mblog' 9 | max_page = 0 10 | 11 | def start_requests(self): 12 | keyword = '000001' 13 | url = '{url}?keyword={keyword}'.format(url=self.start_urls, keyword=keyword) 14 | print(url) 15 | for page in range(self.max_page + 1): 16 | data = { 17 | 'mp': str(self.max_page), 18 | 'page': str(page) 19 | } 20 | yield FormRequest(url, callback=self.parse_index, formdata=data) 21 | 22 | def parse_index(self, response): 23 | print(response.text) 24 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zhihuuser.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zhihuuser 12 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/__init__.py -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | 11 | class UserItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | allow_message = Field() 15 | answer_count = Field() 16 | articles_count = Field() 17 | avatar_url = Field() 18 | avatar_url_template = Field() 19 | badge = Field() 20 | employments = Field() 21 | follower_count = Field() 22 | gender = Field() 23 | headline = Field() 24 | id = Field() 25 | is_org = Field() 26 | name = Field() 27 | type = Field() 28 | url = Field() 29 | url_token = Field() 30 | user_type = Field() 31 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZhihuuserSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class ZhihuuserDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from scrapy.exceptions import DropItem 9 | 10 | 11 | class MongoPipeline(object): 12 | 13 | def __init__(self, mongo_url, mongo_db): 14 | self.mongo_url = mongo_url 15 | self.mongo_db = mongo_db 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | return cls( 20 | mongo_url=crawler.settings.get('MONGO_URL'), 21 | mongo_db=crawler.settings.get('MONGO_DB') 22 | ) 23 | 24 | # 打开一个爬虫的时候就链接数据库 25 | def open_spider(self, spider): 26 | self.client = pymongo.MongoClient(self.mongo_url) 27 | self.db = self.client[self.mongo_db] 28 | 29 | def process_item(self, item, spider): 30 | name = item.__class__.__name__ 31 | self.db['quotes'].insert(dict(item)) 32 | return item 33 | 34 | def close_spider(self, spider): 35 | self.client.close() 36 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
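# A note on the MongoPipeline defined in pipelines.py above: it only runs when it is
# enabled in the project settings, and it reads MONGO_URL / MONGO_DB from them via
# crawler.settings.get(). A minimal sketch of the matching settings.py entries follows;
# the concrete values are assumptions for illustration, not taken from this repo:
ITEM_PIPELINES = {'zhihuuser.pipelines.MongoPipeline': 300}
MONGO_URL = 'localhost'
MONGO_DB = 'zhihuuser'
# Two further points on process_item(): Collection.insert() is deprecated in pymongo 3
# (and removed in pymongo 4), so insert_one(dict(item)) is the usual replacement, and the
# collection name could reuse the already computed item.__class__.__name__ instead of the
# hard-coded 'quotes':
#     self.db[item.__class__.__name__].insert_one(dict(item))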
5 | -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/scrapy/zhihuuser/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/scrapy/zhihuuser/zhihuuser/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, Request 3 | import json 4 | from zhihuuser.items import UserItem 5 | # from scrapy_redis 6 | 7 | class ZhihuSpider(Spider): 8 | name = 'zhihu' 9 | start_urls = ['http://www.zhihui.com/'] 10 | 11 | start_user = 'excited-vczh' 12 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}' 13 | user_query = 'allow_message%2Cis_followed%2Cis_following%2Cis_org%2Cis_blocking%2Cemployments%2Canswer_count%2Cfollower_count%2Carticles_count%2Cgender%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics' 14 | 15 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 16 | follows_query = 'data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics' 17 | 18 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}' 19 | followers_query = 'data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics' 20 | 21 | def start_requests(self): 22 | # 他关注的人的链接 23 | yield Request(self.user_url.format(user=self.start_user, include=self.user_query), callback=self.parse_user) 24 | yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, offset=0, limit=20), callback=self.parse_follows) 25 | yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, offset=0, limit=20), callback=self.parse_followers) 26 | 27 | def parse_user(self, response): 28 | result = json.loads(response.text) 29 | item = UserItem() 30 | for field in item.fields: 31 | if field in result.keys(): 32 | item[field] = result.get(field) 33 | yield item 34 | # 找到每个用户自己的关注列表 35 | yield Request(self.follows_url.format(user=result.get('url_token'), include=self.follows_query, limit=20, offset=0), callback=self.parse_follows) 36 | yield Request(self.followers_url.format(user=result.get('url_token'), include=self.followers_query, limit=20, offset=0), callback=self.parse_followers) 37 | 38 | def parse_follows(self, response): 39 | results = json.loads(response.text) 40 | if 'data' in results.keys(): 41 | for result in results.get('data'): 42 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), callback=self.parse_user) 43 | 44 | if 'paging' in results.keys() and 
results.get('paging').get('is_end') == False: 45 | next_page = results.get('paging').get('next') 46 | yield Request(next_page, callback=self.parse_follows) 47 | 48 | def parse_followers(self, response): 49 | results = json.loads(response.text) 50 | if 'data' in results.keys(): 51 | for result in results.get('data'): 52 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), callback=self.parse_user) 53 | 54 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 55 | next_page = results.get('paging').get('next') 56 | yield Request(next_page, callback=self.parse_followers) 57 | 58 | -------------------------------------------------------------------------------- /spiders/secondSpider/Data_Output.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import time 3 | 4 | 5 | class Data_Output(object): 6 | def __init__(self): 7 | self.filepath = 'baike_%s.html' % (time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())) 8 | self.output_head(self.filepath) 9 | self.datas = [] 10 | 11 | def store_data(self, data): 12 | if data is None: 13 | return 14 | # print('************') 15 | self.datas.append(data) 16 | # print(len(self.datas)) 17 | # if len(self.datas) > 10: 18 | self.output_html(self.filepath) 19 | 20 | def output_head(self, path): 21 | """ 22 | Write the HTML header 23 | :param path: 24 | :return: 25 | """ 26 | # the open() function can only write strings 27 | fout = codecs.open(path, 'a', encoding='utf-8') 28 | fout.write("<html>") 29 | fout.write("<body>") 30 | fout.write("<table>") 31 | fout.close() 32 | 33 | def output_html(self, path): 34 | """ 35 | Write the data into the HTML file 36 | :param path: file path 37 | :return: 38 | """ 39 | fout = codecs.open(path, 'a', encoding='utf-8') 40 | print(self.datas) 41 | for data in self.datas[:]: 42 | fout.write("<tr>") 43 | fout.write("<td>%s</td>" % data['url']) 44 | fout.write("<td>%s</td>" % data['title']) 45 | fout.write("<td>%s</td>" % data['summary']) 46 | fout.write("</tr>") 47 | self.datas.remove(data) 48 | fout.close() 49 | 50 | def ouput_end(self, path): 51 | """ 52 | Write the end of the HTML output 53 | :param path: output file path 54 | :return: 55 | """ 56 | fout = codecs.open(path, 'a', encoding='utf-8') 57 | fout.write("</table>
") 58 | fout.write("") 59 | fout.write("") 60 | fout.close() -------------------------------------------------------------------------------- /spiders/secondSpider/Html_Downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | class Html_Downloader(object): 5 | 6 | def download(self, url): 7 | if url is None: 8 | return None 9 | user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/51.0.2704.63 Safari/537.36' 10 | headers = {'User-Agent': user_agent} 11 | r = requests.get(url, headers=headers) 12 | if r.status_code == 200: 13 | r.encoding = 'utf-8' 14 | return r.text 15 | return None 16 | -------------------------------------------------------------------------------- /spiders/secondSpider/Html_Parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | from urllib import parse 4 | import urllib 5 | 6 | 7 | class Html_Parser(object): 8 | 9 | def parser(self, page_url, html_cont): 10 | """ 11 | 用于解析网页内容,抽取URL 和数据 12 | :param page_url: 下载页面的URL 13 | :param html_cont: 下载的网页内容 14 | :return: 返回URL和数据 15 | """ 16 | if page_url is None or html_cont is None: 17 | return 18 | soup = BeautifulSoup(html_cont, 'lxml') 19 | new_urls = self._get_new_urls(page_url, soup) 20 | new_data = self._get_new_data(page_url, soup) 21 | return new_urls, new_data 22 | 23 | def _get_new_urls(self, page_url, soup): 24 | """ 25 | 抽取新的URl集合 26 | :param page_url: 下载页面的URL 27 | :param soup: soup 28 | :return: 返回新的URL集合 29 | """ 30 | new_urls = set() 31 | # 抽取符合要求的a标记 32 | links = soup.find_all('a', href=re.compile(r'/item/.*')) 33 | for link in links: 34 | # 提取href属性 35 | new_url = link['href'] 36 | # 拼接成完成网址 37 | new_full_url = parse.urljoin(page_url, new_url) 38 | new_urls.add(new_full_url) 39 | return new_urls 40 | 41 | def _get_new_data(self, page_url, soup): 42 | """ 43 | 抽取有效数据 44 | :param page_url:下载页面的URL 45 | :param soup: 46 | :return: 返回有效数据 47 | """ 48 | data = {} 49 | data['url'] = page_url 50 | title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1') 51 | data['title'] = title.text 52 | summary = soup.find('div', class_='lemma-summary') 53 | data['summary'] = summary.text 54 | 55 | return data 56 | 57 | -------------------------------------------------------------------------------- /spiders/secondSpider/SpiderWork.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.managers import BaseManager 2 | from Html_Downloader import Html_Downloader 3 | from Html_Parser import Html_Parser 4 | 5 | 6 | class SpiderWork(object): 7 | def __init__(self): 8 | # 初始化分布式进程工作节点的连接作业 9 | # 实现第一步:使用BaseManager注册用于获取Queue的方法名称 10 | BaseManager.register('get_task_queue') 11 | BaseManager.register('get_result_queue') 12 | # 实现第二步,连接到服务器 13 | server_addr = '192.168.43.149' 14 | print(f'connect to server {server_addr}...') 15 | self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike') 16 | # 从网络连接 17 | self.m.connect() 18 | # 实现第三步:获取Queue的对象 19 | self.task = self.m.get_task_queue() 20 | self.result = self.m.get_result_queue() 21 | # 初始化网页下载器和解析器 22 | self.downloader = Html_Downloader() 23 | self.parser = Html_Parser() 24 | print('init finish') 25 | 26 | def crawl(self): 27 | while 1: 28 | try: 29 | if not self.task.empty(): 30 | url = self.task.get() 31 | if url == 'end': 32 | print('控制节点通知爬虫节点停止工作...') 33 | # 接的通知其他节点停止工作 34 | self.result.put({'new_urls': 
'end', 'data': 'end'}) 35 | return 36 | print('爬虫节点正在解析%s' % url.encode('utf-8')) 37 | content = self.downloader.download(url) 38 | new_urls, data = self.parser.parser(url, content) 39 | # print(new_urls) 40 | self.result.put({'new_urls': new_urls, 'data': data}) 41 | except EOFError as e: 42 | print(e, '连接工作节点失败') 43 | except Exception as e: 44 | print(e) 45 | print('Crawl fail') 46 | 47 | if __name__ == '__main__': 48 | spider = SpiderWork() 49 | spider.crawl() -------------------------------------------------------------------------------- /spiders/secondSpider/URL_Manager.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import hashlib 3 | 4 | 5 | class URL_Manager(object): 6 | def __init__(self): 7 | self.new_urls = self.load_progress('new_urls.txt') # 未爬取URL集合 8 | self.old_urls = self.load_progress('old_urls.txt') # 已爬取URL集合 9 | 10 | def has_new_url(self): 11 | """ 12 | 判断是否有未爬取的URL 13 | """ 14 | return self.new_url_size() != 0 15 | 16 | def get_new_url(self): 17 | """ 18 | 获取一个未爬取的URL 19 | """ 20 | new_url = self.new_urls.pop() 21 | # 摘要算法md() 22 | m = hashlib.md5() 23 | m.update(new_url.encode('utf-8')) 24 | self.old_urls.add(m.hexdigest()[8: -8]) 25 | return new_url 26 | 27 | def add_new_url(self, url): 28 | """ 29 | 将新的URL添加到未爬取的URl集合中 30 | :param url: 单个RUL 31 | :return: 32 | """ 33 | if url is None: 34 | return 35 | m = hashlib.md5() 36 | m.update(url.encode('utf-8')) 37 | url_md5 = m.hexdigest()[8: -8] 38 | if url not in self.new_urls and url_md5 not in self.old_urls: 39 | self.new_urls.add(url) 40 | 41 | def add_new_urls(self, urls): 42 | """ 43 | 将新的URL添加到未爬取的URL集合中 44 | :param urls: url集合 45 | :return: 46 | """ 47 | if urls is None or len(urls) == 0: 48 | return 49 | for url in urls: 50 | self.add_new_url(url) 51 | 52 | 53 | def new_url_size(self): 54 | """ 55 | 获取未爬取URL集合的大小 56 | :return: 57 | """ 58 | return len(self.new_urls) 59 | 60 | def old_url_size(self): 61 | """ 62 | 获取已经爬取URL集合的大小 63 | :return: 64 | """ 65 | return len(self.old_urls) 66 | 67 | def save_process(self, path, data): 68 | """ 69 | 保存进度 70 | :param path:文件路径 71 | :param data: 数据 72 | :return: 73 | """ 74 | with open(path, 'wb') as f: 75 | pickle.dump(data, f) 76 | 77 | def load_progress(self, path): 78 | """ 79 | 从本地文件加载进度 80 | :param path: 文件路径 81 | :return: 返回set集合 82 | """ 83 | print(f'[+]从文件加载进度:{path}') 84 | try: 85 | with open(path, 'rb') as f: 86 | tmp = pickle.load(f) 87 | return tmp 88 | except: 89 | print(f'[!]无进度文件,创建:{path}') 90 | return set() -------------------------------------------------------------------------------- /spiders/secondSpider/__pycache__/Data_Output.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/__pycache__/Data_Output.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/secondSpider/__pycache__/Html_Downloader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/__pycache__/Html_Downloader.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/secondSpider/__pycache__/Html_Parser.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/__pycache__/Html_Parser.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/secondSpider/__pycache__/URL_Manager.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/__pycache__/URL_Manager.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/secondSpider/new_urls.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/new_urls.txt -------------------------------------------------------------------------------- /spiders/secondSpider/old_urls.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/secondSpider/old_urls.txt -------------------------------------------------------------------------------- /spiders/selenium/Frame.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/selenium/Frame.py -------------------------------------------------------------------------------- /spiders/selenium/Jiaohu.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time 3 | 4 | browser = webdriver.Chrome() 5 | browser.get('http://www.taobao.com') 6 | input =browser.find_element_by_id('q') 7 | input.send_keys('IPhone') 8 | time.sleep(1) 9 | input.clear() 10 | input.send_keys('Ipad') 11 | button = browser.find_element_by_class_name('btn-search') 12 | button.click() -------------------------------------------------------------------------------- /spiders/selenium/javaScript.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | browser = webdriver.Chrome() 4 | browser.get('https://www.zhihu.com/explore') 5 | browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') 6 | browser.execute_script('alert("To Bottom")') -------------------------------------------------------------------------------- /spiders/selenium/前进后退.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | browser = webdriver.Chrome() 4 | browser.get('http://www.baidu.com/') 5 | input = browser.find_element_by_id('kw') 6 | input.send_keys('图片') 7 | botton = browser.find_element_by_id('su') 8 | botton.click() -------------------------------------------------------------------------------- /spiders/selenium/获取属性.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver import ActionChains 3 | 4 | browser = webdriver.Chrome() 5 | url = 'https://www.zhihu.com/explore' 6 | 7 | browser.get(url) 8 | # logo = browser.find_element_by_id('zh-top-link-logo') 9 | # print(logo) 10 | # print(logo.get_attribute('class')) 11 | input = browser.find_element_by_class_name('zu-top-link-logo') 12 | print(input.text) -------------------------------------------------------------------------------- 
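One gap worth noting in the selenium scripts above: 前进后退.py is named for history navigation, but it only performs a Baidu search and never calls the back/forward methods. A minimal sketch of that missing step, assuming the same local Chrome driver setup used by the other scripts:

import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('http://www.baidu.com/')
browser.get('https://www.taobao.com/')
browser.back()      # history: back to Baidu
time.sleep(1)
browser.forward()   # history: forward to Taobao again
browser.close()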
/spiders/selenium/选项卡管理.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | 4 | 5 | browser = webdriver.Chrome() 6 | browser.get('http://www.baidu.com') 7 | browser.execute_script('window.open()') 8 | browser.switch_to_window(browser.window_handles[1]) 9 | browser.get('https://www.taobao.com') 10 | time.sleep(1) 11 | browser.switch_to_window(browser.window_handles[0]) 12 | browser.get('https://python.org') -------------------------------------------------------------------------------- /spiders/selenium个人邮箱.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support.ui import WebDriverWait 4 | from selenium.webdriver.support import expected_conditions as EC 5 | 6 | browser = webdriver.Chrome() 7 | wait = WebDriverWait(browser, 10) 8 | 9 | 10 | def search(): 11 | browser.get('https://www.baidu.com/') 12 | input = wait.until(EC.presence_of_element_located((By.ID, 'kw'))) 13 | submit = wait.until(EC.element_to_be_clickable((By.ID, 'su'))) 14 | input.send_keys('qq邮箱') 15 | submit.click() 16 | wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="1"]/h3/a[1]'))).click() 17 | browser.implicitly_wait(10) 18 | browser.switch_to_window(browser.window_handles[1]) 19 | browser.switch_to_frame('login_frame') 20 | wait.until(EC.presence_of_element_located((By.ID, 'switcher_plogin'))).click() 21 | u = wait.until(EC.presence_of_element_located((By.ID, 'u'))) 22 | p = wait.until(EC.presence_of_element_located((By.ID, 'p'))) 23 | u.send_keys('942203701') 24 | p.send_keys('xingfu1314...') 25 | wait.until(EC.element_to_be_clickable((By.ID, 'login_button'))).click() 26 | 27 | 28 | def main(): 29 | search() 30 | 31 | 32 | if __name__ == '__main__': 33 | main() -------------------------------------------------------------------------------- /spiders/selenium模拟淘宝.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import re 3 | from bs4 import BeautifulSoup 4 | from selenium.common.exceptions import TimeoutException 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from config import * 9 | import pymongo 10 | # from selenium.webdriver.chrome.options import Options 11 | 12 | client = pymongo.MongoClient(MONGO_URL) 13 | db = client[MONGO_DB] 14 | # chrome_options = Options() 15 | # chrome_options.add_argument('--headless') 16 | # chrome_options.add_argument('--disable-gpu') 17 | browser = webdriver.Chrome(chrome_options=chrome_options) 18 | # browser = webdriver.PhantomJS(service_args=SERVICE_ARGS) 19 | wait = WebDriverWait(browser, 10) 20 | 21 | # browser.set_window_size(1400, 900) 22 | 23 | 24 | def search(): 25 | try: 26 | browser.get('https://www.taobao.com') 27 | input = wait.until(EC.presence_of_element_located((By.ID, 'q'))) 28 | submit = wait.until( 29 | EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button'))) 30 | input.send_keys('美食') 31 | submit.click() 32 | total = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'total'))) 33 | get_products() 34 | return total.text 35 | except TimeoutException: 36 | return search() 37 | 38 | 39 | def next_page(page_number): 40 | try: 41 | submit = 
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.next > a > span:nth-child(1)'))) 42 | submit.click() 43 | num = wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),str(page_number))) 44 | print(f'正在解析第{page_number}页') 45 | get_products() 46 | except TimeoutException: 47 | next_page(page_number) 48 | 49 | 50 | def get_products(): 51 | product = {} 52 | wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item'))) 53 | html = browser.page_source 54 | soup = BeautifulSoup(html, 'lxml') 55 | items = soup.select('#mainsrp-itemlist .items .item') 56 | # item = soup.find_all(class_='item') 57 | for item in items: 58 | product = { 59 | 'image': item.select('.pic .img')[0]['data-src'], 60 | 'price': item.find(class_='price').text.strip(), 61 | 'deal': item.find(class_='deal-cnt').text[: -3], 62 | 'title': item.find(class_='title').text.strip(), 63 | 'shop': item.find(class_='shop').text.strip(), 64 | 'location': item.find(class_='location').text 65 | } 66 | save_to_mongo(product) 67 | 68 | 69 | def save_to_mongo(result): 70 | try: 71 | if db[MONGO_DB].insert(result): 72 | print('存储到MONGODB成功') 73 | except Exception: 74 | print('存储失败') 75 | 76 | 77 | def main(): 78 | total = search() 79 | total = int(re.search('(\d+)', total).group(1)) 80 | for i in range(2, total+1): 81 | next_page(i) 82 | 83 | 84 | if __name__ == '__main__': 85 | main() -------------------------------------------------------------------------------- /spiders/中国大学排名定向爬虫.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | import requests 3 | import urllib.error 4 | from bs4 import BeautifulSoup 5 | import os 6 | 7 | 8 | def get_html_text(url): 9 | try: 10 | r = requests.get(url, timeout=30) 11 | r.raise_for_status() 12 | r.encoding = r.apparent_encoding 13 | return r.text 14 | except urllib.error.URLError as e: 15 | print(e.reason) 16 | return "" 17 | 18 | 19 | def fill_univ_list(ulist, html): 20 | soup = BeautifulSoup(html, 'lxml') # html/xml这两种格式 21 | # print(soup.prettify()) 22 | # print(soup.find('tbody')) 23 | for tr in soup.find("tbody").children: 24 | if isinstance(tr, bs4.element.Tag): # 过滤掉非标签类型 25 | tds = tr('td') 26 | # print(tds) 27 | ulist.append([tds[0].string, tds[1].string, tds[2].string]) 28 | 29 | # all_u = soup.find("tbody").children 30 | # print(all_u) 31 | # for tr in all_u: 32 | # tds = tr.find_all('td') 33 | # ulist.append([tds[0], tds[1], tds[2]]) 34 | 35 | 36 | def print_univ_list(ulist, num): 37 | tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}" 38 | print(tplt.format("排名", "学校名称", "省市", chr(12288))) 39 | for i in range(num): 40 | u = ulist[i] 41 | print(tplt.format(u[0], u[1], u[2], chr(12288))) 42 | 43 | if __name__ == '__main__': 44 | uinfo = [] 45 | url = "http://www.zuihaodaxue.cn/shengyuanzhiliangpaiming2018.html" 46 | html = get_html_text(url) 47 | fill_univ_list(uinfo, html) 48 | print_univ_list(uinfo, 20) -------------------------------------------------------------------------------- /spiders/分布式爬虫/Data_Output.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | 4 | class Data_Output(object): 5 | def __init__(self): 6 | self.datas = [] 7 | 8 | def store_data(self, data): 9 | if data is None: 10 | return 11 | self.datas.append(data) 12 | 13 | def output_html(self): 14 | fout = codecs.open('baike.html', 'w', encoding='gbk') 15 | 
fout.write("") 16 | fout.write("") 17 | fout.write("") 18 | for data in self.datas: 19 | fout.write("") 20 | fout.write("" % data['url']) 21 | fout.write("" % data['title']) 22 | fout.write("" % data['summary']) 23 | fout.write("") 24 | fout.write("") 25 | fout.write("") 26 | fout.write("
%s%s%s
") 27 | fout.close() -------------------------------------------------------------------------------- /spiders/分布式爬虫/Html_Downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | class Html_Downloader(object): 5 | 6 | def download(self, url): 7 | if url is None: 8 | return None 9 | user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/51.0.2704.63 Safari/537.36' 10 | headers = {'User-Agent': user_agent} 11 | r = requests.get(url, headers=headers) 12 | if r.status_code == 200: 13 | r.encoding = 'Utf-8' 14 | return r.text 15 | return None 16 | -------------------------------------------------------------------------------- /spiders/分布式爬虫/Html_Parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | from urllib import parse 4 | import urllib 5 | 6 | 7 | class Html_Parser(object): 8 | 9 | def parser(self, page_url, html_cont): 10 | """ 11 | 用于解析网页内容,抽取URL 和数据 12 | :param page_url: 下载页面的URL 13 | :param html_cont: 下载的网页内容 14 | :return: 返回URL和数据 15 | """ 16 | if page_url is None or html_cont is None: 17 | return 18 | soup = BeautifulSoup(html_cont, 'lxml') 19 | new_urls = self._get_new_urls(page_url, soup) 20 | new_data = self._get_new_data(page_url, soup) 21 | # print(new_urls, new_data) 22 | return new_urls, new_data 23 | 24 | def _get_new_urls(self, page_url, soup): 25 | """ 26 | 抽取新的URl集合 27 | :param page_url: 下载页面的URL 28 | :param soup: soup 29 | :return: 返回新的URL集合 30 | """ 31 | new_urls = set() 32 | # 抽取符合要求的a标记 33 | links = soup.find_all('a', href=re.compile(r'/view/\d+\.htm')) 34 | # print('****************') 35 | # print(links) 36 | # print('***************') 37 | for link in links: 38 | # 提取href属性 39 | new_url = link['href'] 40 | # 拼接成完成网址 41 | new_full_url = parse.urljoin(page_url, new_url) 42 | new_urls.add(new_full_url) 43 | return new_urls 44 | 45 | 46 | def _get_new_data(self, page_url, soup): 47 | """ 48 | 抽取有效数据 49 | :param page_url:下载页面的URL 50 | :param soup: 51 | :return: 返回有效数据 52 | """ 53 | data = {} 54 | data['url'] = page_url 55 | title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1') 56 | # print(title.get_text) 57 | data['title'] = title.text 58 | summary = soup.find('div', class_='lemma-summary') 59 | # print(summary.text) 60 | # 获取tag中包含的所有文本内容,包括子孙tag中的内容,并将结果作为Unico字符串返回 61 | data['summary'] = summary.text 62 | 63 | return data 64 | 65 | -------------------------------------------------------------------------------- /spiders/分布式爬虫/SpiderMan.py: -------------------------------------------------------------------------------- 1 | from Data_Output import Data_Output 2 | from Html_Downloader import Html_Downloader 3 | from Html_Parser import Html_Parser 4 | from URL_Manager import URL_Manager 5 | 6 | 7 | class SpiderMan(object): 8 | def __init__(self): 9 | self.mamager = URL_Manager() 10 | self.downloader = Html_Downloader() 11 | self.parser = Html_Parser() 12 | self.output = Data_Output() 13 | 14 | def crawl(self, root_url): 15 | # 添加入口URL 16 | self.mamager.add_new_url(root_url) 17 | # 判断url管理器中是否有新的url, 同时判断抓取了多少个url 18 | while (self.mamager.has_new_url() and self.mamager.old_url_size()<100): 19 | try: 20 | # 从URL管理器获取新的url 21 | new_url = self.mamager.get_new_url() 22 | html = self.downloader.download(new_url) 23 | # 从html解析器抽取网页数据 24 | new_url, data = self.parser.parser(new_url, html) 25 | # 将抽取的url 添加到URL管理器中 26 | # print(new_url, data) 27 | 
self.mamager.add_new_urls(new_url) 28 | # 数据存储器存储文件 29 | # print('***************8') 30 | self.output.store_data(data) 31 | 32 | print(f"已经抓取{self.mamager.old_url_size()}个链接") 33 | except Exception as e: 34 | print(e) 35 | self.output.output_html() 36 | 37 | if __name__ == '__main__': 38 | spider_man = SpiderMan() 39 | spider_man.crawl("http://baike.baidu.com/view/284853.htm") 40 | -------------------------------------------------------------------------------- /spiders/分布式爬虫/URL_Manager.py: -------------------------------------------------------------------------------- 1 | class URL_Manager(object): 2 | def __init__(self): 3 | self.new_urls = set() # 未爬取URL集合 4 | self.old_urls = set() # 已爬取URL集合 5 | 6 | def has_new_url(self): 7 | """ 8 | 判断是否有未爬取的URL 9 | """ 10 | return self.new_url_size() != 0 11 | 12 | def get_new_url(self): 13 | """ 14 | 获取一个未爬取的URL 15 | """ 16 | new_url = self.new_urls.pop() 17 | self.old_urls.add(new_url) 18 | return new_url 19 | 20 | def add_new_url(self, url): 21 | """ 22 | 将新的URL添加到未爬取的URl集合中 23 | :param url: 单个RUL 24 | :return: 25 | """ 26 | if url is None: 27 | return 28 | if url not in self.new_urls and url not in self.old_urls: 29 | # print(self.new_urls, self.old_urls) 30 | self.new_urls.add(url) 31 | # print(self.new_urls) 32 | # print('*****') 33 | 34 | def add_new_urls(self, urls): 35 | """ 36 | 将新的URL添加到未爬取的URL集合中 37 | :param urls: url集合 38 | :return: 39 | """ 40 | if urls is None or len(urls) == 0: 41 | return 42 | for url in urls: 43 | self.add_new_url(url) 44 | 45 | 46 | def new_url_size(self): 47 | """ 48 | 获取未爬取URL集合的大小 49 | :return: 50 | """ 51 | return len(self.new_urls) 52 | 53 | def old_url_size(self): 54 | """ 55 | 获取已经爬取URL集合的大小 56 | :return: 57 | """ 58 | return len(self.old_urls) 59 | -------------------------------------------------------------------------------- /spiders/分布式爬虫/__pycache__/Data_Output.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/分布式爬虫/__pycache__/Data_Output.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/分布式爬虫/__pycache__/Html_Downloader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/分布式爬虫/__pycache__/Html_Downloader.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/分布式爬虫/__pycache__/Html_Parser.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/分布式爬虫/__pycache__/Html_Parser.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/分布式爬虫/__pycache__/URL_Manager.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dli98/py-/cd0fd5509bebd57c48389b40d58390f2234b9945/spiders/分布式爬虫/__pycache__/URL_Manager.cpython-36.pyc -------------------------------------------------------------------------------- /spiders/分布式进程/taskManager.py: -------------------------------------------------------------------------------- 1 | import queue 2 | import socket 3 | from socket import SOL_SOCKET, SO_REUSEADDR 4 | from multiprocessing.managers import BaseManager 5 | from multiprocessing import 
freeze_support 6 | # 任务个数 7 | task_number = 10 8 | # 定义收发队列 9 | task_queue = queue.Queue(task_number) 10 | result_queue = queue.Queue(task_number) 11 | 12 | 13 | def get_task(): 14 | return task_queue 15 | 16 | 17 | def get_result(): 18 | return result_queue 19 | 20 | 21 | # 创建类似的queueManager: 22 | # 从BaseManager继承的 23 | class QueueManager(BaseManager): 24 | pass 25 | 26 | 27 | def win_run(): 28 | # windows 下绑定调用接口不能使用lambda,所以只能先定义函数再绑定 29 | # 把两个队列注册到网络上 30 | sk = socket.socket() 31 | QueueManager.register('get_task_queue', callable=get_task) 32 | QueueManager.register('get_result_queue', callable=get_result) 33 | # 绑定端口并设置验证口令,Windows下需要填写IP地址,Linux 下不填默认本机地址 34 | sk.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1) 35 | manager = QueueManager(address=('192.168.43.149', 8001), authkey=b'qiye') 36 | # 启动 37 | manager.start() 38 | try: 39 | # 通过网络获取任务队列和结果队列 40 | task = manager.get_task_queue() 41 | result = manager.get_result_queue() 42 | # 添加任务 43 | for url in ["ImageUrl_"+str(i) for i in range(10)]: 44 | print(f'put task {url}...') 45 | task.put(url) 46 | print('try get result...') 47 | for i in range(10): 48 | print(f'result is {result.get(timeout=100)}') 49 | except: 50 | print('Manager error') 51 | finally: 52 | # 一定要关闭,否则会报管道未关闭的错误 53 | manager.shutdown() 54 | 55 | 56 | if __name__ == '__main__': 57 | # Windos 下多进程可能会有问题,添加这句可以缓解 58 | freeze_support() 59 | win_run() -------------------------------------------------------------------------------- /spiders/分布式进程/taskWork.py: -------------------------------------------------------------------------------- 1 | import time 2 | from multiprocessing.managers import BaseManager 3 | 4 | 5 | class QueueManager(BaseManager): 6 | pass 7 | # 第一步,使用QueueManager 注册用于获取Queen的方法名称 8 | QueueManager.register('get_task_queue') 9 | QueueManager.register('get_result_queue') 10 | # 第二步,链接服务器 11 | server_addr = '192.168.43.149' 12 | print(f'Connect to server {server_addr}') 13 | # 端口和验证口令注意保持与服务进程完全一致: 14 | m = QueueManager(address=(server_addr, 8001), authkey=b'qiye') 15 | # 从网络链接: 16 | m.connect() 17 | # 第三步:获取queue的对象 18 | task = m.get_task_queue() 19 | result = m.get_result_queue() 20 | # 第四步,从task队列中获取任务,并把结果写入result队列: 21 | while (not task.empty()): 22 | immage_url = task.get(True, timeout=20) 23 | print(f'run task download {immage_url}...') 24 | time.sleep(1) 25 | result.put(f'{immage_url}--->success') 26 | 27 | # 处理结束: 28 | print('worker exit.') 29 | 30 | -------------------------------------------------------------------------------- /spiders/医生信息索取.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | import requests 3 | import urllib.error 4 | from bs4 import BeautifulSoup 5 | import os 6 | import random 7 | import re 8 | 9 | user_agents = [ 10 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', 11 | 'Opera/9.25 (Windows NT 5.1; U; en)', 12 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 13 | 'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)', 14 | 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12', 15 | 'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9', 16 | "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7", 17 | "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ", 18 | "Mozilla/5.0 (Windows NT 10.0; 
WOW64)", 19 | 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/51.0.2704.63 Safari/537.36' 20 | ] 21 | 22 | 23 | def get_html_text(url, page): 24 | try: 25 | headers = {'User-Agent': user_agents[-1], 26 | 'Host': '400.haodf.com' 27 | } 28 | kv = {'nowpage': page} 29 | r = requests.get(url, headers=headers, params=kv) 30 | r.raise_for_status() 31 | r.encoding = r.apparent_encoding 32 | return r.text 33 | except Exception as e: 34 | print(e) 35 | return "" 36 | 37 | 38 | def parse_doc_info(ulis, html): 39 | soup = BeautifulSoup(html, 'lxml') 40 | all_info = soup.find_all(class_='clearfix showResult-cell bb pb10 mt15') 41 | for p in all_info: 42 | try: 43 | tc_p = p.find('p', class_='tc mt5') 44 | name = tc_p.find('a').text 45 | grade = re.findall(r'.*.*?', html, flags=re.S) 23 | for dd in dds: 24 | try: 25 | title = re.findall(r'title="(.+?)"', dd)[0], 26 | staring = re.findall(r'star">(.*?)

</p>', dd, flags=re.S)[0].split()[0], 27 | # print(staring) 28 | releasetime = re.findall(r'releasetime">(.*?)</p>
', dd)[0], 29 | yield { 30 | 'title': title, 31 | 'staring': staring, 32 | 'releasetime': releasetime 33 | } 34 | except: 35 | continue 36 | 37 | 38 | def output_info(content): 39 | # mutex = Lock() 40 | # mutex.acquire() 41 | with open('test.txt', 'a', encoding='utf-8') as f: 42 | f.write(json.dumps(content, ensure_ascii=False) + '\n', ) 43 | f.close() 44 | # mutex.release() 45 | 46 | 47 | def main(offset): 48 | info = [] 49 | url = 'http://maoyan.com/board/4?offset=' + str(offset) 50 | html = get_htme_text(url) 51 | for item in parse_html(html): 52 | print(item) 53 | output_info(item) 54 | 55 | 56 | if __name__ == '__main__': 57 | for i in range(10): 58 | main(i*10) 59 | # pool = Pool() 60 | # # for i in range(10): 61 | # # pool.apply_async(main, args=(i*10,)) 62 | # pool.map(main, [i * 10 for i in range(10)]) 63 | # pool.close() 64 | # pool.join() 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /spiders/百度图片.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | import urllib.error 4 | from bs4 import BeautifulSoup 5 | 6 | 7 | class Bai_du: 8 | def __init__(self): 9 | self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36' \ 10 | ' (KHTML, like Gecko)Chrome/51.0.2704.63 Safari/537.36' 11 | keyword = input("Input key word:") 12 | self.keyword = {'word': keyword} 13 | self.header = {'User_Agent': self.user_agent} 14 | self.url = "https://image.baidu.com/search/index?tn=baiduimage&" \ 15 | "ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0" \ 16 | "&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&" 17 | 18 | def get_html_text(self): 19 | try: 20 | r = requests.get(url=self.url, headers=self.header, params=self.keyword) 21 | r.raise_for_status() 22 | print(r.request.url) 23 | r.encoding = r.apparent_encoding 24 | self.html = r.text 25 | except urllib.error.URLError as e: 26 | print(e.reason) 27 | 28 | def pick_pic(self): 29 | print(self.html) 30 | soup = BeautifulSoup(self.html, 'lxml') 31 | img_list = soup.find('ul', class_='imglist clearfix pageNum0') 32 | print(img_list) 33 | 34 | 35 | if __name__ == "__main__": 36 | spider = Bai_du() 37 | spider.get_html_text() 38 | spider.pick_pic() 39 | 40 | # try: 41 | # if os.path.exists('./photo.jpg'): 42 | # os.remove('./photo.jpg') 43 | # print('删除同名文件') 44 | # with open('photo.jpg', 'ab+') as f: 45 | # f.write(r.content + b'\n') # HTTP响应内容二进制形式 46 | # print("下载完成") 47 | # except urllib.error.URLError as e: 48 | # print(e.reason) 49 | # print("爬取失败") -------------------------------------------------------------------------------- /spiders/股票爬虫.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | import traceback 4 | import os 5 | from bs4 import BeautifulSoup 6 | 7 | 8 | def get_html_text(url, code='utf-8'): 9 | try: 10 | r = requests.get(url, timeout=30) 11 | r.raise_for_status() 12 | r.encoding = code 13 | return r.text 14 | except: 15 | print('失败') 16 | return "" 17 | 18 | 19 | def get_stock_list(lst, stockurl): 20 | """ 21 | 从东方财富网获取股票列表 22 | :param lst: 23 | :param stockurl: 24 | """ 25 | html = get_html_text(stockurl, 'GB2312') 26 | soup = BeautifulSoup(html, 'lxml') 27 | all_a = soup.find_all('a') 28 | for i in all_a: 29 | try: 30 | print(type(i)) 31 | href = i.attrs['href'] 32 | # print(type(href)) 33 | lst.append(re.findall(r'[s][hz]\d{6}', href)[0]) 34 | # 
print(type(re.findall(r'[s][hz]\d{6}', href)[0])) 35 | except: 36 | continue 37 | 38 | 39 | def get_stock_info(lst, stockurl, fpath): 40 | count = 0 41 | for stock in lst: 42 | url = stockurl + stock + ".html" 43 | html =get_html_text(url) 44 | try: 45 | if html =='': 46 | continue 47 | info_dict = {} 48 | soup = BeautifulSoup(html, 'lxml') 49 | stock_info = soup.find('div', class_='stock-bets') 50 | # print(type(stock_info)) find()返回的是个标签 51 | name = stock_info.find(class_='bets-name') 52 | # print(name.text) 53 | info_dict.update({'股票名称': name.text.split()[0]}) 54 | 55 | keylist = stock_info.find_all('dt') 56 | # print(type(keylist)) find_all返回的是结果集 57 | valuelist = stock_info.find_all('dd') 58 | for i in range(len(keylist)): 59 | key = keylist[i].text 60 | val = valuelist[i].text 61 | info_dict[key] = val 62 | 63 | with open(fpath, 'a', encoding='utf-8') as f: 64 | f.write(str(info_dict) + '\n') 65 | count = count + 1 66 | print('当前速度:{:.2%}'.format(count/len(lst)), end='\r') 67 | 68 | except: 69 | count = count + 1 70 | print('当前速度:{:.2%}'.format(count/len(lst)), end='\r') 71 | traceback.print_exc() 72 | continue 73 | 74 | 75 | if __name__ == '__main__': 76 | stock_list_url = 'http://quote.eastmoney.com/stocklist.html' 77 | stock_info_url = 'https://gupiao.baidu.com/stock/' 78 | output_file = 'D://BaidustockInfo.txt' 79 | if os.path.exists(output_file): 80 | os.remove(output_file) 81 | print('删除同名文件') 82 | slist = [] 83 | get_stock_list(slist, stock_list_url) 84 | get_stock_info(slist, stock_info_url, output_file) -------------------------------------------------------------------------------- /spiders/豆瓣.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | import requests 4 | from lxml import etree 5 | # 第三方库User-Agent模块,它提供了最新、最全的浏览器的标识 6 | # 支持谷歌、火狐、IE、Opera 7 | # 使用命令pip3 install fake-useragent安装 8 | from fake_useragent import UserAgent 9 | 10 | 11 | class DBMovie(object): 12 | def __init__(self): 13 | self.base_url = 'https://movie.douban.com/top250' 14 | # self.ua = UserAgent() 15 | self.html_obj = None 16 | 17 | def open_file(self): 18 | csv_file = open('movie.csv', 'w', encoding='utf-8', newline='') 19 | self.writer = csv.DictWriter( 20 | csv_file, 21 | fieldnames=[ 22 | 'movie_rank', 'movie_name', 'movie_member', 'movie_star', 'movie_comment', 'movie_quote' 23 | ] 24 | ) 25 | self.writer.writeheader() 26 | 27 | def get_next_page_url(self): 28 | a = self.html_obj.xpath('//span[@class="next"]/a') 29 | if len(a) == 0: 30 | print('已经是最后一页') 31 | return 32 | next_page = a[0].xpath('@href')[0] 33 | # next_page:?start=50&filter= 34 | self.get_page_code(next_page) 35 | 36 | def write_movie_info(self, movie_list): 37 | for index, moive in enumerate(movie_list): 38 | self.writer.writerow(moive) 39 | print('第{}页写入完成'.format(index)) 40 | self.get_next_page_url() 41 | 42 | def get_content_by_xpath(self, html_obj): 43 | movie_list = [] 44 | item_div = html_obj.xpath('//div[@class="item"]') 45 | for item_tag in item_div: 46 | movie_dict = {} 47 | 48 | em = item_tag.xpath('.//em/text()')[0] 49 | print(em) 50 | hd = item_tag.xpath('.//div[@class="hd"]/a/span/text()') 51 | # 将hd中的3个信息拼接在一起 52 | info = '' 53 | for info_text in hd: 54 | content = info_text.strip('\n').strip() 55 | info += content 56 | # 演员 57 | member_info = item_tag.xpath('.//p[@class=""]/text()')[0].strip('\n').strip() 58 | # 电影评分 59 | star_number = item_tag.xpath('.//span[@class="rating_num"]/text()')[0] 60 | # 电影评论 61 | comment_number = 
item_tag.xpath('.//div[@class="star"]/span[last()]/text()')[0] 62 | comment_number = re.search(re.compile('(\d+)'), comment_number).group(1) 63 | # 电影点评 64 | quote = item_tag.xpath('.//span[@class="inq"]') 65 | if len(quote) != 0: 66 | quote = quote[0].xpath('text()')[0] 67 | else: 68 | quote = '影评不存在' 69 | 70 | # 将以上数据添加到movie_dict里 71 | movie_dict['movie_rank'] = em 72 | movie_dict['movie_name'] = info 73 | movie_dict['movie_member'] = member_info 74 | movie_dict['movie_star'] = star_number 75 | movie_dict['movie_comment'] = comment_number 76 | movie_dict['movie_quote'] = quote 77 | 78 | movie_list.append(movie_dict) 79 | 80 | self.write_movie_info(movie_list) 81 | 82 | def get_page_code(self, url=""): 83 | # abs_url:请求的绝对路径 84 | # 第2页 url = ?start=25&filter= 85 | # 第2次请求abs_url = https://www.douban.com/top250 + ?start=50&filter= 86 | abs_url = self.base_url + url 87 | content = requests.get(abs_url, headers={ 88 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0' 89 | }).content.decode() 90 | print(content) 91 | 92 | # 把网页源代码解析成文档树对象 93 | self.html_obj = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8')) 94 | 95 | # 调用get_content_by_xpath() 96 | self.get_content_by_xpath(self.html_obj) 97 | 98 | 99 | if __name__ == "__main__": 100 | movie_obj = DBMovie() 101 | movie_obj.open_file() 102 | movie_obj.get_page_code() 103 | --------------------------------------------------------------------------------
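A closing note on 豆瓣.py above: it imports UserAgent from fake_useragent (and its comments explain how to install the package), yet self.ua = UserAgent() stays commented out and a Firefox User-Agent string is hard-coded instead. A minimal sketch of how the randomized header would usually be wired in, written as a standalone helper rather than as a patch to the DBMovie class (the default URL is the same Top 250 address the class uses):

import requests
from fake_useragent import UserAgent

ua = UserAgent()  # builds and caches a pool of real browser identifiers on first use


def get_page_code(base_url='https://movie.douban.com/top250', url=''):
    # ua.random returns a different User-Agent string on each access,
    # replacing the hard-coded Firefox string in DBMovie.get_page_code()
    headers = {'User-Agent': ua.random}
    return requests.get(base_url + url, headers=headers).content.decode()


if __name__ == '__main__':
    print(get_page_code()[:300])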