├── .gitignore
├── BaiduStocks
│   ├── BaiduStocks
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── stock.py
│   └── scrapy.cfg
├── CrawBaiduStocksA.py
├── CrawTaobaoPrice.py
├── CrawUnivRanking.py
├── MaoYan
│   └── spider.py
├── TaobaoProduct
│   ├── config.py
│   └── spider.py
├── TouTiao
│   ├── config.py
│   └── spider.py
├── Weixin
│   ├── config.py
│   └── spider.py
└── Zhihu
    ├── Zhihu
    │   ├── __init__.py
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── zhihu.py
    └── scrapy.cfg
/.gitignore:
--------------------------------------------------------------------------------
1 | BaiduStockInfo.txt
2 | .vscode
3 | .idea
4 | __pycache__
5 | pics
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CP-0/py_spider_codes/d7875b33c21d06b3a9af952248bf9f3acd50db93/BaiduStocks/BaiduStocks/__init__.py
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BaidustocksItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class BaidustocksSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class BaidustocksPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
13 |
14 | class BaidustocksInfoPipeline(object):
15 | def open_spider(self, spider):
16 | self.f = open('BaiduStockInfo.txt', 'w')
17 |
18 | def close_spider(self, spider):
19 | self.f.close()
20 |
21 | def process_item(self, item, spider):
22 | try:
23 | line = str(dict(item)) + '\n'
24 | self.f.write(line)
25 | except:
26 | pass
27 | return item
28 |
--------------------------------------------------------------------------------
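A side note on BaidustocksInfoPipeline above: it writes str(dict(item)) per line, which is readable but awkward to parse back. A minimal sketch of a JSON-lines variant (the class name and the BaiduStockInfo.jl file name are illustrative, not part of this project):

    import json

    class BaidustocksJsonLinesPipeline(object):
        def open_spider(self, spider):
            # utf-8 plus ensure_ascii=False keeps the Chinese field names readable in the file
            self.f = open('BaiduStockInfo.jl', 'w', encoding='utf-8')

        def close_spider(self, spider):
            self.f.close()

        def process_item(self, item, spider):
            self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item
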
/BaiduStocks/BaiduStocks/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for BaiduStocks project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'BaiduStocks'
13 |
14 | SPIDER_MODULES = ['BaiduStocks.spiders']
15 | NEWSPIDER_MODULE = 'BaiduStocks.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'BaiduStocks (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'BaiduStocks.middlewares.BaidustocksSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'BaiduStocks.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/spiders/stock.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 |
5 |
6 | class StockSpider(scrapy.Spider):
7 | name = 'stock'
8 | start_urls = ['http://quote.eastmoney.com/stocklist.html']
9 |
10 | def parse(self, response):
11 | for href in response.css('a::attr(href)').extract():
12 | try:
13 | stock = re.findall(r"[s][hz]\d{6}", href)[0]
14 | url = 'https://gupiao.baidu.com/stock/' + stock + '.html'
15 | yield scrapy.Request(url, callback=self.parse_stock)
16 | except:
17 | continue
18 |
19 | def parse_stock(self, response):
20 | infoDict = {}
21 | stockInfo = response.css('.stock-bets')
22 | name = stockInfo.css('.bets-name').extract()[0]
23 | infoDict.update(
24 | {'股票名称': re.findall(r'\s.*\(', name)[0].split()[0] +
25 | re.findall(r'\>.*\<', name)[0][1:-1]})
26 |
27 | keyList = stockInfo.css('dt').extract()
28 | valueList = stockInfo.css('dd').extract()
29 | for i in range(len(keyList)):
30 | key = re.findall(r'>.*</dt>', keyList[i])[0][1:-5]
31 | try:
32 | val = re.findall(r'\d+\.?.*</dd>', valueList[i])[0][0:-5]
33 | except:
34 | val = '--'
35 | infoDict[key] = val
36 |
37 | yield infoDict
38 |
--------------------------------------------------------------------------------
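The usual way to run the spider above is "scrapy crawl stock" from the directory that holds scrapy.cfg. For completeness, a minimal programmatic runner is sketched below (a hypothetical run.py placed next to scrapy.cfg, assuming Scrapy is installed):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    if __name__ == '__main__':
        # Load the project settings (pipelines, ROBOTSTXT_OBEY, ...) and run the spider by name
        process = CrawlerProcess(get_project_settings())
        process.crawl('stock')
        process.start()
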
/BaiduStocks/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = BaiduStocks.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = BaiduStocks
12 |
--------------------------------------------------------------------------------
/CrawBaiduStocksA.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import re
4 | import traceback
5 |
6 |
7 | def getHTMLText(url, code='utf-8'):
8 | try:
9 | r = requests.get(url)
10 | r.raise_for_status()
11 | r.encoding = code
12 | return r.text
13 | except:
14 | return ''
15 |
16 |
17 | def getStockList(lst, stockURL):
18 | html = getHTMLText(stockURL, 'GB2312')
19 | soup = BeautifulSoup(html, 'html.parser')
20 | a = soup.find_all('a')
21 | for i in a:
22 | try:
23 | href = i.attrs['href']
24 | lst.append(re.findall(r'[s][hz]\d{6}', href)[0])
25 | except:
26 | continue
27 |
28 |
29 | def getStockInfo(lst, stockURL, fpath):
30 | count = 0
31 | for stock in lst:
32 | url = stockURL + stock + '.html'
33 | html = getHTMLText(url)
34 | try:
35 | if html == '':
36 | continue
37 | infoDict = {}
38 | soup = BeautifulSoup(html, 'html.parser')
39 | stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
40 |
41 | name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
42 | infoDict.update({'股票名称': name.text.split()[0]})
43 |
44 | keyList = stockInfo.find_all('dt')
45 | valueList = stockInfo.find_all('dd')
46 | for i in range(len(keyList)):
47 | key = keyList[i].text
48 | val = valueList[i].text
49 | infoDict[key] = val
50 |
51 | with open(fpath, 'a', encoding='utf-8') as f:
52 | f.write(str(infoDict) + '\n')
53 | count = count + 1
54 | print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end="")
55 | except:
56 | count = count + 1
57 | print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end="")
58 | continue
59 |
60 |
61 | def main():
62 | stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
63 | stock_info_url = 'https://gupiao.baidu.com/stock/'
64 | output_file = './BaiduStockInfo.txt'
65 | slist = []
66 | getStockList(slist, stock_list_url)
67 | getStockInfo(slist, stock_info_url, output_file)
68 |
69 |
70 | main()
71 |
--------------------------------------------------------------------------------
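Both this script and the Scrapy spider above pull stock codes out of hrefs with the pattern r'[s][hz]\d{6}'; a quick self-contained check of that extraction on made-up links:

    import re

    hrefs = ['http://quote.eastmoney.com/sh600000.html',
             'http://quote.eastmoney.com/sz000001.html',
             'http://quote.eastmoney.com/about.html']
    codes = [m for h in hrefs for m in re.findall(r'[s][hz]\d{6}', h)]
    print(codes)  # ['sh600000', 'sz000001'] -- a link without a code simply contributes no match
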
/CrawTaobaoPrice.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 |
4 |
5 | def getHTMLText(url):
6 | try:
7 | r = requests.get(url, timeout=30)
8 | r.raise_for_status()
9 | r.encoding = r.apparent_encoding
10 | return r.text
11 | except:
12 | return ''
13 |
14 |
15 | def parsePage(ilt, html):
16 | try:
17 | plt = re.findall(r'\"price\"\:\"[\d.]*\"', html)
18 | tlt = re.findall(r'\"title\"\:\".*?\"', html)
19 | tilt = re.findall(r'\"tag_info\"\:.*?]{1}', html)
20 | for i in range(len(plt)):
21 | price = "¥" + eval(plt[i].split(':')[1])
22 | title = eval(tlt[i].split(':')[1])
23 | tglt = re.findall(r'\"tag\"\:\"(.*?)\"', tilt[i])
24 | feature = ', '.join(tglt)
25 | ilt.append([price, title, feature])
26 | except:
27 | print('')
28 |
29 |
30 | def printGoodsList(ilt):
31 | tplt = "{:4}\t{:8}\t{:16}\t{:16}"
32 | print(tplt.format("序号", "价格", "商品名称", "特色"))
33 | count = 0
34 | for j in ilt:
35 | count = count + 1
36 | print(tplt.format(count, j[0], j[1], j[2]))
37 |
38 |
39 | def main():
40 | goods = "手机"
41 | depth = 2
42 | start_url = 'https://s.taobao.com/search?q=' + goods
43 | infoList = []
44 | for i in range(depth):
45 | try:
46 | url = start_url + "&s=" + str(i * 48)
47 | html = getHTMLText(url)
48 | parsePage(infoList, html)
49 | except:
50 | continue
51 | printGoodsList(infoList)
52 |
53 |
54 | main()
55 |
--------------------------------------------------------------------------------
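parsePage above works because the search page embeds its results as JSON-like fields in the page source; an offline illustration of the price/title regexes on an invented snippet (not real Taobao output):

    import re

    sample = '"price":"5199.00","title":"Some Phone","tag_info":[{"tag":"free shipping"}]'
    plt = re.findall(r'\"price\"\:\"[\d.]*\"', sample)
    tlt = re.findall(r'\"title\"\:\".*?\"', sample)
    price = eval(plt[0].split(':')[1])  # eval() strips the surrounding quotes, as in parsePage
    title = eval(tlt[0].split(':')[1])
    print(price, title)  # 5199.00 Some Phone

ast.literal_eval would be a stricter way to strip those quotes than eval.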
/CrawUnivRanking.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import bs4
4 |
5 |
6 | def getHTMLText(url):
7 | try:
8 | r = requests.get(url)
9 | r.raise_for_status()
10 | r.encoding = r.apparent_encoding
11 | return r.text
12 | except:
13 | return ''
14 |
15 |
16 | def fillUnivList(ulist, html):
17 | soup = BeautifulSoup(html, 'html.parser')
18 | for tr in soup.find('tbody').children:
19 | if isinstance(tr, bs4.element.Tag):
20 | tds = tr('td')
21 | ulist.append([tds[0].string, tds[1].string, tds[2].string])
22 |
23 |
24 | def printUnivList(ulist, num):
25 | tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
26 | print(tplt.format('排名', '学校', '地区', chr(12288)))
27 | for i in range(num):
28 | u = ulist[i]
29 | print(tplt.format(u[0], u[1], u[2], chr(12288)))
30 |
31 |
32 | def main():
33 | uinfo = []
34 | url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
35 | html = getHTMLText(url)
36 | fillUnivList(uinfo, html)
37 | printUnivList(uinfo, 20)
38 |
39 |
40 | main()
41 |
--------------------------------------------------------------------------------
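The chr(12288) passed as the fourth format argument above is the full-width (CJK) space; using it as the fill character for the 学校 column keeps Chinese text aligned, since an ordinary ASCII space is narrower than a Chinese character. A tiny demo:

    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(tplt.format('排名', '学校', '地区', chr(12288)))
    print(tplt.format('1', '清华大学', '北京', chr(12288)))
    print(tplt.format('2', '北京大学', '北京', chr(12288)))
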
/MaoYan/spider.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Pool
2 | import re
3 | import json
4 | import requests
5 | from requests.exceptions import RequestException
6 |
7 |
8 | def get_one_page(url):
9 | try:
10 | headers = {'user-agent': 'Mozilla/5.0'}
11 | response = requests.get(url, headers=headers)
12 | if response.status_code == 200:
13 | return response.text
14 | return None
15 | except RequestException:
16 | return None
17 |
18 |
19 | def parse_one_page(html):
20 | pattern = re.compile('<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
21 | + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
22 | + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
23 | items = re.findall(pattern, html)
24 | for item in items:
25 | yield {
26 | 'index': item[0],
27 | 'image': item[1],
28 | 'title': item[2],
29 | 'actor': item[3].strip()[3:],
30 | 'time': item[4].strip()[5:],
31 | 'score': item[5] + item[6]
32 | }
33 |
34 |
35 | def write_to_file(content):
36 | with open('result.txt', 'a', encoding='utf-8') as f:
37 | f.write(json.dumps(content, ensure_ascii=False) + '\n')
38 | f.close()
39 |
40 |
41 | def main(offset):
42 | url = 'http://maoyan.com/board/4?offset=' + str(offset)
43 | html = get_one_page(url)
44 | for item in parse_one_page(html):
45 | print(item)
46 | write_to_file(item)
47 |
48 |
49 | if __name__ == '__main__':
50 | pool = Pool()
51 | pool.map(main, [i * 10 for i in range(10)])
52 | pool.close()
53 | pool.join()
54 |
--------------------------------------------------------------------------------
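write_to_file above passes ensure_ascii=False so that Chinese titles land in result.txt as readable text instead of \u escapes; the difference in isolation:

    import json

    item = {'index': '1', 'title': '霸王别姬', 'score': '9.6'}
    print(json.dumps(item))                      # Chinese comes out as \uXXXX escapes
    print(json.dumps(item, ensure_ascii=False))  # Chinese stays readable

(The explicit f.close() inside the with block is redundant; the context manager already closes the file.)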
/TaobaoProduct/config.py:
--------------------------------------------------------------------------------
1 | MONGO_URL = 'localhost'
2 | MONGO_DB = 'taobao'
3 | MONGO_TABLE = 'product'
4 |
5 | SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
6 |
7 | KEYWORD = '美食'
--------------------------------------------------------------------------------
/TaobaoProduct/spider.py:
--------------------------------------------------------------------------------
1 | import re
2 | from selenium import webdriver
3 | from selenium.common.exceptions import TimeoutException
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support.ui import WebDriverWait
6 | from selenium.webdriver.support import expected_conditions as EC
7 | from pyquery import PyQuery as pq
8 | from config import *
9 | import pymongo
10 |
11 | client = pymongo.MongoClient(MONGO_URL)
12 | db = client[MONGO_DB]
13 |
14 | # browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
15 | browser = webdriver.Chrome()
16 | wait = WebDriverWait(browser, 10)
17 |
18 | # browser.set_window_size(1400, 900)
19 |
20 |
21 | def search():
22 | print('正在搜索')
23 | try:
24 | browser.get('https://www.taobao.com')
25 | input = wait.until(
26 | EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
27 | )
28 | submit = wait.until(
29 | EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
30 | input.send_keys(KEYWORD)
31 | submit.click()
32 | total = wait.until(
33 | EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
34 | get_products()
35 | return total.text
36 | except TimeoutException:
37 | return search()
38 |
39 |
40 | def next_page(page_number):
41 | print('正在翻页', page_number)
42 | try:
43 | input = wait.until(
44 | EC.presence_of_element_located(
45 | (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
46 | )
47 | submit = wait.until(EC.element_to_be_clickable(
48 | (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
49 | input.clear()
50 | input.send_keys(page_number)
51 | submit.click()
52 | wait.until(EC.text_to_be_present_in_element(
53 | (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
54 | get_products()
55 | except TimeoutException:
56 | next_page(page_number)
57 |
58 |
59 | def get_products():
60 | wait.until(EC.presence_of_element_located(
61 | (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
62 | html = browser.page_source
63 | doc = pq(html)
64 | items = doc('#mainsrp-itemlist .items .item').items()
65 | for item in items:
66 | product = {
67 | 'image': item.find('.pic .img').attr('src'),
68 | 'price': item.find('.price').text(),
69 | 'deal': item.find('.deal-cnt').text()[:-3],
70 | 'title': item.find('.title').text(),
71 | 'shop': item.find('.shop').text(),
72 | 'location': item.find('.location').text()
73 | }
74 | print(product)
75 | save_to_mongo(product)
76 |
77 |
78 | def save_to_mongo(result):
79 | try:
80 | if db[MONGO_TABLE].insert(result):
81 | print('存储到MONGODB成功', result)
82 | except Exception:
83 | print('存储到MONGODB失败', result)
84 |
85 |
86 | def main():
87 | try:
88 | total = search()
89 | total = int(re.compile('(\d+)').search(total).group(1))
90 | for i in range(2, total + 1):
91 | next_page(i)
92 | except Exception:
93 | print('出错啦')
94 | finally:
95 | browser.close()
96 |
97 |
98 | if __name__ == '__main__':
99 | main()
100 |
--------------------------------------------------------------------------------
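save_to_mongo above relies on Collection.insert(), which is deprecated since pymongo 3.0 and removed in pymongo 4.0. A minimal sketch of the same helper on the newer API, reusing the project's own config.py:

    import pymongo
    from config import MONGO_URL, MONGO_DB, MONGO_TABLE

    client = pymongo.MongoClient(MONGO_URL)
    db = client[MONGO_DB]

    def save_to_mongo(result):
        try:
            db[MONGO_TABLE].insert_one(result)  # non-deprecated single-document insert
            print('存储到MONGODB成功', result)
        except Exception:
            print('存储到MONGODB失败', result)
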
/TouTiao/config.py:
--------------------------------------------------------------------------------
1 | MONGO_URL = 'localhost'
2 | MONGO_DB = 'toutiao'
3 | MONGO_TABLE = 'toutiao'
4 |
5 | GROUP_START = 1
6 | GROUP_END = 10
7 | KEYWORD='街拍'
--------------------------------------------------------------------------------
/TouTiao/spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from urllib.parse import urlencode
4 | import pymongo
5 | import requests
6 | from bs4 import BeautifulSoup
7 | from requests.exceptions import ConnectionError
8 | import re
9 | from multiprocessing import Pool
10 | from hashlib import md5
11 | from json.decoder import JSONDecodeError
12 | from config import *
13 |
14 | client = pymongo.MongoClient(MONGO_URL, connect=False)
15 | db = client[MONGO_DB]
16 |
17 | if not os.path.exists('pics'):
18 | os.mkdir('pics')
19 |
20 |
21 | def get_page_index(offset, keyword):
22 | data = {
23 | 'autoload': 'true',
24 | 'count': 20,
25 | 'cur_tab': 3,
26 | 'format': 'json',
27 | 'keyword': keyword,
28 | 'offset': offset,
29 | }
30 | params = urlencode(data)
31 | base = 'http://www.toutiao.com/search_content/'
32 | url = base + '?' + params
33 | try:
34 | response = requests.get(url)
35 | if response.status_code == 200:
36 | return response.text
37 | return None
38 | except ConnectionError:
39 | print('Error occurred')
40 | return None
41 |
42 |
43 | def download_image(url):
44 | print('Downloading', url)
45 | try:
46 | response = requests.get(url)
47 | if response.status_code == 200:
48 | imgType = response.headers.get('Content-Type')[6:]
49 | if imgType == 'jpeg':
50 | imgType = 'jpg'
51 | save_image(response.content, imgType)
52 | return None
53 | except ConnectionError:
54 | return None
55 |
56 |
57 | def save_image(content, type):
58 | file_path = '{0}/{1}.{2}'.format(os.getcwd() + '/pics',
59 | md5(content).hexdigest(), type)
60 | print(file_path)
61 | if not os.path.exists(file_path):
62 | with open(file_path, 'wb') as f:
63 | f.write(content)
64 | f.close()
65 |
66 |
67 | def parse_page_index(text):
68 | try:
69 | data = json.loads(text)
70 | if data and 'data' in data.keys():
71 | for item in data.get('data'):
72 | yield item.get('article_url')
73 | except JSONDecodeError:
74 | pass
75 |
76 |
77 | def get_page_detail(url):
78 | try:
79 | response = requests.get(url)
80 | if response.status_code == 200:
81 | return response.text
82 | return None
83 | except ConnectionError:
84 | print('Error occurred')
85 | return None
86 |
87 |
88 | def parse_page_detail(html, url):
89 | soup = BeautifulSoup(html, 'lxml')
90 | result = soup.select('title')
91 | title = result[0].get_text() if result else ''
92 | images_pattern = re.compile('gallery: JSON.parse\(\"(.*?)\"\),', re.S)
93 | result = re.search(images_pattern, html)
94 | if result:
95 | data = json.loads(result.group(1).replace(
96 | '\\"', '\"').replace('\\\\', ''))
97 | if data and 'sub_images' in data.keys():
98 | sub_images = data.get('sub_images')
99 | images = [item.get('url') for item in sub_images]
100 | for image in images:
101 | download_image(image)
102 | return {
103 | 'title': title,
104 | 'url': url,
105 | 'images': images
106 | }
107 |
108 |
109 | def save_to_mongo(result):
110 | if db[MONGO_TABLE].insert(result):
111 | print('Successfully Saved to Mongo', result)
112 | return True
113 | return False
114 |
115 |
116 | def main(offset):
117 | text = get_page_index(offset, KEYWORD)
118 | urls = parse_page_index(text)
119 | for url in urls:
120 | html = get_page_detail(url)
121 | result = parse_page_detail(html, url)
122 | if result:
123 | save_to_mongo(result)
124 |
125 |
126 | pool = Pool()
127 | groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
128 | pool.map(main, groups)
129 | pool.close()
130 | pool.join()
131 |
--------------------------------------------------------------------------------
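save_image above names every picture by the MD5 of its raw bytes, so downloading the same image twice never produces a duplicate file; the naming scheme in isolation (the byte string is a stand-in for response.content):

    import os
    from hashlib import md5

    content = b'raw image bytes'                  # stand-in for response.content
    name = md5(content).hexdigest()               # identical bytes always map to the same name
    file_path = '{0}/{1}.{2}'.format(os.getcwd() + '/pics', name, 'jpg')
    print(file_path)
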
/Weixin/config.py:
--------------------------------------------------------------------------------
1 | PROXY_POOL_URL = 'http://127.0.0.1:5000/get'
2 | KEYWORD = 'Python'
3 | MONGO_URI = 'localhost'
4 | MONGO_DB = 'weixin'
5 | MAX_COUNT = 5
--------------------------------------------------------------------------------
/Weixin/spider.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urlencode
2 | import pymongo
3 | import requests
4 | from lxml.etree import XMLSyntaxError
5 | from requests.exceptions import ConnectionError
6 | from pyquery import PyQuery as pq
7 | from config import *
8 |
9 | client = pymongo.MongoClient(MONGO_URI)
10 | db = client[MONGO_DB]
11 |
12 | base_url = 'http://weixin.sogou.com/weixin?'
13 |
14 | headers = {
15 | 'Cookie': 'SUV=1509129808368866; SMYUV=1509129808395088; ABTEST=0|1511965164|v1; SNUID=48EF0F91E5E0BBFA36750514E5021D03; SUID=AC0BEB744842910A000000005A1EC1EC; SUID=AC0BEB745018910A000000005A1EC1ED; weixinIndexVisited=1; IPLOC=CN3100; JSESSIONID=aaaxgt9qdt-zzePJ7Pv8v; ppinf=5|1511965755|1513175355|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo3OlZhbXBpcmV8Y3J0OjEwOjE1MTE5NjU3NTV8cmVmbmljazo3OlZhbXBpcmV8dXNlcmlkOjQ0Om85dDJsdUExVEZaUHQwcGN6ZVo3cU54RDV4MDhAd2VpeGluLnNvaHUuY29tfA; pprdig=c6yA_9zg5lQZIzDaGlJmPdRCP6fkQ-FcEBHcjtOkZLhPHbF7ld-lilOnLnyR6Xu9cjbAJUfdsYIYI2pvMBUdahClUcpxYJK46PqJZ1WgU1nA7BR1IRwvjqnpCKdzAT5WkmlKLXi2L9XEKcZF7ItOHVy0mLb-yIZ9nU2OlWr4CMc; sgid=02-30131415-AVoexDskULls7k9ia1Km7RuA; ppmdig=15119657560000008c4babee24ee063e8afb031f772b2169; sct=4',
16 | 'Host': 'weixin.sogou.com',
17 | 'Upgrade-Insecure-Requests': '1',
18 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
19 | }
20 |
21 | proxy = None
22 |
23 |
24 | def get_proxy():
25 | try:
26 | response = requests.get(PROXY_POOL_URL)
27 | if response.status_code == 200:
28 | return response.text
29 | return None
30 | except ConnectionError:
31 | return None
32 |
33 | def get_html(url, count=1):
34 | print('Crawling', url)
35 | print('Trying Count', count)
36 | global proxy
37 | if count >= MAX_COUNT:
38 | print('Tried Too Many Counts')
39 | return None
40 | try:
41 | if proxy:
42 | proxies = {
43 | 'http': 'http://' + proxy
44 | }
45 | response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
46 | else:
47 | response = requests.get(url, allow_redirects=False, headers=headers)
48 | if response.status_code == 200:
49 | return response.text
50 | if response.status_code == 302:
51 | # Need Proxy
52 | print('302')
53 | proxy = get_proxy()
54 | if proxy:
55 | print('Using Proxy', proxy)
56 | return get_html(url)
57 | else:
58 | print('Get Proxy Failed')
59 | return None
60 | except ConnectionError as e:
61 | print('Error Occurred', e.args)
62 | proxy = get_proxy()
63 | count += 1
64 | return get_html(url, count)
65 |
66 |
67 |
68 | def get_index(keyword, page):
69 | data = {
70 | 'query': keyword,
71 | 'type': 2,
72 | 'page': page
73 | }
74 | queries = urlencode(data)
75 | url = base_url + queries
76 | html = get_html(url)
77 | return html
78 |
79 | def parse_index(html):
80 | doc = pq(html)
81 | items = doc('.news-box .news-list li .txt-box h3 a').items()
82 | for item in items:
83 | yield item.attr('href')
84 |
85 | def get_detail(url):
86 | try:
87 | response = requests.get(url)
88 | if response.status_code == 200:
89 | return response.text
90 | return None
91 | except ConnectionError:
92 | return None
93 |
94 | def parse_detail(html):
95 | try:
96 | doc = pq(html)
97 | title = doc('.rich_media_title').text()
98 | content = doc('.rich_media_content').text()
99 | date = doc('#post-date').text()
100 | nickname = doc('#js_profile_qrcode > div > strong').text()
101 | wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
102 | return {
103 | 'title': title,
104 | 'content': content,
105 | 'date': date,
106 | 'nickname': nickname,
107 | 'wechat': wechat
108 | }
109 | except XMLSyntaxError:
110 | return None
111 |
112 | def save_to_mongo(data):
113 | if db['articles'].update({'title': data['title']}, {'$set': data}, True):
114 | print('Saved to Mongo', data['title'])
115 | else:
116 | print('Saved to Mongo Failed', data['title'])
117 |
118 |
119 | def main():
120 | for page in range(1, 101):
121 | html = get_index(KEYWORD, page)
122 | if html:
123 | article_urls = parse_index(html)
124 | for article_url in article_urls:
125 | article_html = get_detail(article_url)
126 | if article_html:
127 | article_data = parse_detail(article_html)
128 | print(article_data)
129 | if article_data:
130 | save_to_mongo(article_data)
131 |
132 |
133 |
134 | if __name__ == '__main__':
135 | main()
--------------------------------------------------------------------------------
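save_to_mongo above upserts with Collection.update(filter, {'$set': ...}, True), a call that is deprecated in pymongo 3 and removed in pymongo 4. A sketch of the equivalent with update_one, reusing MONGO_URI and MONGO_DB from config.py:

    import pymongo
    from config import MONGO_URI, MONGO_DB

    client = pymongo.MongoClient(MONGO_URI)
    db = client[MONGO_DB]

    def save_to_mongo(data):
        # upsert=True inserts the article when no document with this title exists yet
        db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True)
        print('Saved to Mongo', data['title'])
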
/Zhihu/Zhihu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CP-0/py_spider_codes/d7875b33c21d06b3a9af952248bf9f3acd50db93/Zhihu/Zhihu/__init__.py
--------------------------------------------------------------------------------
/Zhihu/Zhihu/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item, Field
9 |
10 |
11 | class UserItem(Item):
12 | # define the fields for your item here like:
13 | id = Field()
14 | name = Field()
15 | avatar_url = Field()
16 | headline = Field()
17 | description = Field()
18 | url = Field()
19 | url_token = Field()
20 | gender = Field()
21 | cover_url = Field()
22 | type = Field()
23 | badge = Field()
24 |
25 | answer_count = Field()
26 | articles_count = Field()
27 | commercial_question_count = Field()
28 | favorite_count = Field()
29 | favorited_count = Field()
30 | follower_count = Field()
31 | following_columns_count = Field()
32 | following_count = Field()
33 | pins_count = Field()
34 | question_count = Field()
35 | thank_from_count = Field()
36 | thank_to_count = Field()
37 | thanked_count = Field()
38 | vote_from_count = Field()
39 | vote_to_count = Field()
40 | voteup_count = Field()
41 | following_favlists_count = Field()
42 | following_question_count = Field()
43 | following_topic_count = Field()
44 | marked_answers_count = Field()
45 | mutual_followees_count = Field()
46 | hosted_live_count = Field()
47 | participated_live_count = Field()
48 |
49 | locations = Field()
50 | educations = Field()
51 | employments = Field()
--------------------------------------------------------------------------------
/Zhihu/Zhihu/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class ZhihuSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/Zhihu/Zhihu/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 |
9 |
10 | class ZhihuPipeline(object):
11 | def process_item(self, item, spider):
12 | return item
13 |
14 |
15 | class MongoPipeline(object):
16 | collection_name = 'users'
17 |
18 | def __init__(self, mongo_uri, mongo_db):
19 | self.mongo_uri = mongo_uri
20 | self.mongo_db = mongo_db
21 |
22 | @classmethod
23 | def from_crawler(cls, crawler):
24 | return cls(
25 | mongo_uri=crawler.settings.get('MONGO_URI'),
26 | mongo_db=crawler.settings.get('MONGO_DATABASE')
27 | )
28 |
29 | def open_spider(self, spider):
30 | self.client = pymongo.MongoClient(self.mongo_uri)
31 | self.db = self.client[self.mongo_db]
32 |
33 | def close_spider(self, spider):
34 | self.client.close()
35 |
36 | def process_item(self, item, spider):
37 | self.db[self.collection_name].update({'url_token': item['url_token']}, dict(item), True)
38 | return item
39 |
--------------------------------------------------------------------------------
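MongoPipeline.process_item above performs a whole-document upsert through the deprecated Collection.update(). On pymongo 3+/4 the same behaviour is spelled replace_one with upsert=True; a drop-in sketch of that method for the class above:

    def process_item(self, item, spider):
        # Replace (or insert) the stored user document, keyed by url_token
        self.db[self.collection_name].replace_one(
            {'url_token': item['url_token']}, dict(item), upsert=True)
        return item
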
/Zhihu/Zhihu/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for Zhihu project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'Zhihu'
13 |
14 | SPIDER_MODULES = ['Zhihu.spiders']
15 | NEWSPIDER_MODULE = 'Zhihu.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'Zhihu (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | # DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
45 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
46 | }
47 |
48 | # Enable or disable spider middlewares
49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
50 | # SPIDER_MIDDLEWARES = {
51 | # 'Zhihu.middlewares.ZhihuSpiderMiddleware': 543,
52 | # }
53 |
54 | # Enable or disable downloader middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
56 | # DOWNLOADER_MIDDLEWARES = {
57 | # 'Zhihu.middlewares.MyCustomDownloaderMiddleware': 543,
58 | # }
59 |
60 | # Enable or disable extensions
61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
62 | # EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | # }
65 |
66 | # Configure item pipelines
67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | 'Zhihu.pipelines.MongoPipeline': 300,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | # AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | # AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | # AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | # AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | # HTTPCACHE_ENABLED = True
88 | # HTTPCACHE_EXPIRATION_SECS = 0
89 | # HTTPCACHE_DIR = 'httpcache'
90 | # HTTPCACHE_IGNORE_HTTP_CODES = []
91 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
93 | MONGO_URI = 'localhost'
94 | MONGO_DATABASE = 'zhihu'
--------------------------------------------------------------------------------
/Zhihu/Zhihu/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/Zhihu/Zhihu/spiders/zhihu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 |
4 | from scrapy import Request, Spider
5 |
6 | from Zhihu.items import UserItem
7 |
8 |
9 | class ZhihuSpider(Spider):
10 | name = 'zhihu'
11 | allowed_domains = ['www.zhihu.com']
12 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
13 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
14 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
15 | start_user = 'excited-vczh'
16 | user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
17 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
18 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
19 |
20 | def start_requests(self):
21 | yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)
22 | yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0),
23 | self.parse_follows)
24 | yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0),
25 | self.parse_followers)
26 |
27 | def parse_user(self, response):
28 | result = json.loads(response.text)
29 | item = UserItem()
30 |
31 | for field in item.fields:
32 | if field in result.keys():
33 | item[field] = result.get(field)
34 | yield item
35 |
36 | yield Request(
37 | self.follows_url.format(user=result.get(
38 | 'url_token'), include=self.follows_query, limit=20, offset=0),
39 | self.parse_follows)
40 |
41 | yield Request(
42 | self.followers_url.format(user=result.get(
43 | 'url_token'), include=self.followers_query, limit=20, offset=0),
44 | self.parse_followers)
45 |
46 | def parse_follows(self, response):
47 | results = json.loads(response.text)
48 |
49 | if 'data' in results.keys():
50 | for result in results.get('data'):
51 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
52 | self.parse_user)
53 |
54 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
55 | next_page = results.get('paging').get('next')
56 | yield Request(next_page, self.parse_follows)
57 |
58 | def parse_followers(self, response):
59 | results = json.loads(response.text)
60 |
61 | if 'data' in results.keys():
62 | for result in results.get('data'):
63 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
64 | self.parse_user)
65 |
66 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
67 | next_page = results.get('paging').get('next')
68 | yield Request(next_page, self.parse_followers)
69 |
--------------------------------------------------------------------------------
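The spider above builds every API request by plain string formatting; for reference, this is roughly what the first user request looks like for the seed account (the include list is shortened here, the spider sends the full user_query):

    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    include = 'answer_count,follower_count,gender,description'  # shortened for readability
    print(user_url.format(user='excited-vczh', include=include))
    # https://www.zhihu.com/api/v4/members/excited-vczh?include=answer_count,follower_count,gender,description
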
/Zhihu/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = Zhihu.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = Zhihu
12 |
--------------------------------------------------------------------------------