├── .gitignore
├── BaiduStocks
│   ├── BaiduStocks
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── stock.py
│   └── scrapy.cfg
├── CrawBaiduStocksA.py
├── CrawTaobaoPrice.py
├── CrawUnivRanking.py
├── MaoYan
│   └── spider.py
├── TaobaoProduct
│   ├── config.py
│   └── spider.py
├── TouTiao
│   ├── config.py
│   └── spider.py
├── Weixin
│   ├── config.py
│   └── spider.py
└── Zhihu
    ├── Zhihu
    │   ├── __init__.py
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── zhihu.py
    └── scrapy.cfg
/.gitignore:
--------------------------------------------------------------------------------
1 | BaiduStockInfo.txt
2 | .vscode
3 | .idea
4 | __pycache__
5 | pics
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CP-0/py_spider_codes/d7875b33c21d06b3a9af952248bf9f3acd50db93/BaiduStocks/BaiduStocks/__init__.py
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BaidustocksItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class BaidustocksSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class BaidustocksPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
13 |
14 | class BaidustocksInfoPipeline(object):
15 | def open_spider(self, spider):
16 | self.f = open('BaiduStockInfo.txt', 'w')
17 |
18 | def close_spider(self, spider):
19 | self.f.close()
20 |
21 | def process_item(self, item, spider):
22 | try:
23 | line = str(dict(item)) + '\n'
24 | self.f.write(line)
25 | except:
26 | pass
27 | return item
28 |
--------------------------------------------------------------------------------
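A side note on BaidustocksInfoPipeline above: it writes str(dict(item)) per line, which is readable but awkward to parse back. A minimal sketch of a JSON-lines variant (the class name and the BaiduStockInfo.jl file name are illustrative, not part of this project):

    import json

    class BaidustocksJsonLinesPipeline(object):
        def open_spider(self, spider):
            # utf-8 plus ensure_ascii=False keeps the Chinese field names readable in the file
            self.f = open('BaiduStockInfo.jl', 'w', encoding='utf-8')

        def close_spider(self, spider):
            self.f.close()

        def process_item(self, item, spider):
            self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item
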
/BaiduStocks/BaiduStocks/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for BaiduStocks project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'BaiduStocks'
13 |
14 | SPIDER_MODULES = ['BaiduStocks.spiders']
15 | NEWSPIDER_MODULE = 'BaiduStocks.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'BaiduStocks (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'BaiduStocks.middlewares.BaidustocksSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'BaiduStocks.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/BaiduStocks/BaiduStocks/spiders/stock.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 |
5 |
6 | class StockSpider(scrapy.Spider):
7 | name = 'stock'
8 | start_urls = ['http://quote.eastmoney.com/stocklist.html']
9 |
10 | def parse(self, response):
11 | for href in response.css('a::attr(href)').extract():
12 | try:
13 | stock = re.findall(r"[s][hz]\d{6}", href)[0]
14 | url = 'https://gupiao.baidu.com/stock/' + stock + '.html'
15 | yield scrapy.Request(url, callback=self.parse_stock)
16 | except:
17 | continue
18 |
19 | def parse_stock(self, response):
20 | infoDict = {}
21 | stockInfo = response.css('.stock-bets')
22 | name = stockInfo.css('.bets-name').extract()[0]
23 | infoDict.update(
24 | {'股票名称': re.findall(r'\s.*\(', name)[0].split()[0] +
25 | re.findall(r'\>.*\<', name)[0][1:-1]})
26 |
27 | keyList = stockInfo.css('dt').extract()
28 | valueList = stockInfo.css('dd').extract()
29 | for i in range(len(keyList)):
30 | key = re.findall(r'>.*</dt>', keyList[i])[0][1:-5]
31 | try:
32 | val = re.findall(r'\d+\.?.*</dd>', valueList[i])[0][0:-5]
33 | except:
34 | val = '--'
35 | infoDict[key] = val
36 |
37 | yield infoDict
38 |
--------------------------------------------------------------------------------
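The usual way to run the spider above is "scrapy crawl stock" from the directory that holds scrapy.cfg. For completeness, a minimal programmatic runner is sketched below (a hypothetical run.py placed next to scrapy.cfg, assuming Scrapy is installed):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    if __name__ == '__main__':
        # Load the project settings (pipelines, ROBOTSTXT_OBEY, ...) and run the spider by name
        process = CrawlerProcess(get_project_settings())
        process.crawl('stock')
        process.start()
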
/BaiduStocks/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = BaiduStocks.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = BaiduStocks
12 |
--------------------------------------------------------------------------------
/CrawBaiduStocksA.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import re
4 | import traceback
5 |
6 |
7 | def getHTMLText(url, code='utf-8'):
8 | try:
9 | r = requests.get(url)
10 | r.raise_for_status()
11 | r.encoding = code
12 | return r.text
13 | except:
14 | return ''
15 |
16 |
17 | def getStockList(lst, stockURL):
18 | html = getHTMLText(stockURL, 'GB2312')
19 | soup = BeautifulSoup(html, 'html.parser')
20 | a = soup.find_all('a')
21 | for i in a:
22 | try:
23 | href = i.attrs['href']
24 | lst.append(re.findall(r'[s][hz]\d{6}', href)[0])
25 | except:
26 | continue
27 |
28 |
29 | def getStockInfo(lst, stockURL, fpath):
30 | count = 0
31 | for stock in lst:
32 | url = stockURL + stock + '.html'
33 | html = getHTMLText(url)
34 | try:
35 | if html == '':
36 | continue
37 | infoDict = {}
38 | soup = BeautifulSoup(html, 'html.parser')
39 | stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
40 |
41 | name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
42 | infoDict.update({'股票名称': name.text.split()[0]})
43 |
44 | keyList = stockInfo.find_all('dt')
45 | valueList = stockInfo.find_all('dd')
46 | for i in range(len(keyList)):
47 | key = keyList[i].text
48 | val = valueList[i].text
49 | infoDict[key] = val
50 |
51 | with open(fpath, 'a', encoding='utf-8') as f:
52 | f.write(str(infoDict) + '\n')
53 | count = count + 1
54 | print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end="")
55 | except:
56 | count = count + 1
57 | print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end="")
58 | continue
59 |
60 |
61 | def main():
62 | stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
63 | stock_info_url = 'https://gupiao.baidu.com/stock/'
64 | output_file = './BaiduStockInfo.txt'
65 | slist = []
66 | getStockList(slist, stock_list_url)
67 | getStockInfo(slist, stock_info_url, output_file)
68 |
69 |
70 | main()
71 |
--------------------------------------------------------------------------------
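Both this script and the Scrapy spider above pull stock codes out of hrefs with the pattern r'[s][hz]\d{6}'; a quick self-contained check of that extraction on made-up links:

    import re

    hrefs = ['http://quote.eastmoney.com/sh600000.html',
             'http://quote.eastmoney.com/sz000001.html',
             'http://quote.eastmoney.com/about.html']
    codes = [m for h in hrefs for m in re.findall(r'[s][hz]\d{6}', h)]
    print(codes)  # ['sh600000', 'sz000001'] -- a link without a code simply contributes no match
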
/CrawTaobaoPrice.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 |
4 |
5 | def getHTMLText(url):
6 | try:
7 | r = requests.get(url, timeout=30)
8 | r.raise_for_status()
9 | r.encoding = r.apparent_encoding
10 | return r.text
11 | except:
12 | return ''
13 |
14 |
15 | def parsePage(ilt, html):
16 | try:
17 | plt = re.findall(r'\"price\"\:\"[\d.]*\"', html)
18 | tlt = re.findall(r'\"title\"\:\".*?\"', html)
19 | tilt = re.findall(r'\"tag_info\"\:.*?]{1}', html)
20 | for i in range(len(plt)):
21 | price = "¥" + eval(plt[i].split(':')[1])
22 | title = eval(tlt[i].split(':')[1])
23 | tglt = re.findall(r'\"tag\"\:\"(.*?)\"', tilt[i])
24 | feature = ', '.join(tglt)
25 | ilt.append([price, title, feature])
26 | except:
27 | print('')
28 |
29 |
30 | def printGoodsList(ilt):
31 | tplt = "{:4}\t{:8}\t{:16}\t{:16}"
32 | print(tplt.format("序号", "价格", "商品名称", "特色"))
33 | count = 0
34 | for j in ilt:
35 | count = count + 1
36 | print(tplt.format(count, j[0], j[1], j[2]))
37 |
38 |
39 | def main():
40 | goods = "手机"
41 | depth = 2
42 | start_url = 'https://s.taobao.com/search?q=' + goods
43 | infoList = []
44 | for i in range(depth):
45 | try:
46 | url = start_url + "&s=" + str(i * 48)
47 | html = getHTMLText(url)
48 | parsePage(infoList, html)
49 | except:
50 | continue
51 | printGoodsList(infoList)
52 |
53 |
54 | main()
55 |
--------------------------------------------------------------------------------
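parsePage above works because the search page embeds its results as JSON-like fields in the page source; an offline illustration of the price/title regexes on an invented snippet (not real Taobao output):

    import re

    sample = '"price":"5199.00","title":"Some Phone","tag_info":[{"tag":"free shipping"}]'
    plt = re.findall(r'\"price\"\:\"[\d.]*\"', sample)
    tlt = re.findall(r'\"title\"\:\".*?\"', sample)
    price = eval(plt[0].split(':')[1])  # eval() strips the surrounding quotes, as in parsePage
    title = eval(tlt[0].split(':')[1])
    print(price, title)  # 5199.00 Some Phone

ast.literal_eval would be a stricter way to strip those quotes than eval.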
/CrawUnivRanking.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import bs4
4 |
5 |
6 | def getHTMLText(url):
7 | try:
8 | r = requests.get(url)
9 | r.raise_for_status()
10 | r.encoding = r.apparent_encoding
11 | return r.text
12 | except:
13 | return ''
14 |
15 |
16 | def fillUnivList(ulist, html):
17 | soup = BeautifulSoup(html, 'html.parser')
18 | for tr in soup.find('tbody').children:
19 | if isinstance(tr, bs4.element.Tag):
20 | tds = tr('td')
21 | ulist.append([tds[0].string, tds[1].string, tds[2].string])
22 |
23 |
24 | def printUnivList(ulist, num):
25 | tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
26 | print(tplt.format('排名', '学校', '地区', chr(12288)))
27 | for i in range(num):
28 | u = ulist[i]
29 | print(tplt.format(u[0], u[1], u[2], chr(12288)))
30 |
31 |
32 | def main():
33 | uinfo = []
34 | url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
35 | html = getHTMLText(url)
36 | fillUnivList(uinfo, html)
37 | printUnivList(uinfo, 20)
38 |
39 |
40 | main()
41 |
--------------------------------------------------------------------------------
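The chr(12288) passed as the fourth format argument above is the full-width (CJK) space; using it as the fill character for the 学校 column keeps Chinese text aligned, since an ordinary ASCII space is narrower than a Chinese character. A tiny demo:

    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(tplt.format('排名', '学校', '地区', chr(12288)))
    print(tplt.format('1', '清华大学', '北京', chr(12288)))
    print(tplt.format('2', '北京大学', '北京', chr(12288)))
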
/MaoYan/spider.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Pool
2 | import re
3 | import json
4 | import requests
5 | from requests.exceptions import RequestException
6 |
7 |
8 | def get_one_page(url):
9 | try:
10 | headers = {'user-agent': 'Mozilla/5.0'}
11 | response = requests.get(url, headers=headers)
12 | if response.status_code == 200:
13 | return response.text
14 | return None
15 | except RequestException:
16 | return None
17 |
18 |
19 | def parse_one_page(html):
20 | pattern = re.compile('<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
21 | + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
22 | + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
23 | items = re.findall(pattern, html)
24 | for item in items:
25 | yield {
26 | 'index': item[0],
27 | 'image': item[1],
28 | 'title': item[2],
29 | 'actor': item[3].strip()[3:],
30 | 'time': item[4].strip()[5:],
31 | 'score': item[5] + item[6]
32 | }
33 |
34 |
35 | def write_to_file(content):
36 | with open('result.txt', 'a', encoding='utf-8') as f:
37 | f.write(json.dumps(content, ensure_ascii=False) + '\n')
38 | f.close()
39 |
40 |
41 | def main(offset):
42 | url = 'http://maoyan.com/board/4?offset=' + str(offset)
43 | html = get_one_page(url)
44 | for item in parse_one_page(html):
45 | print(item)
46 | write_to_file(item)
47 |
48 |
49 | if __name__ == '__main__':
50 | pool = Pool()
51 | pool.map(main, [i * 10 for i in range(10)])
52 | pool.close()
53 | pool.join()
54 |
--------------------------------------------------------------------------------
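write_to_file above passes ensure_ascii=False so that Chinese titles land in result.txt as readable text instead of \u escapes; the difference in isolation:

    import json

    item = {'index': '1', 'title': '霸王别姬', 'score': '9.6'}
    print(json.dumps(item))                      # Chinese comes out as \uXXXX escapes
    print(json.dumps(item, ensure_ascii=False))  # Chinese stays readable

(The explicit f.close() inside the with block is redundant; the context manager already closes the file.)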
/TaobaoProduct/config.py:
--------------------------------------------------------------------------------
1 | MONGO_URL = 'localhost'
2 | MONGO_DB = 'taobao'
3 | MONGO_TABLE = 'product'
4 |
5 | SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
6 |
7 | KEYWORD = '美食'
--------------------------------------------------------------------------------
/TaobaoProduct/spider.py:
--------------------------------------------------------------------------------
1 | import re
2 | from selenium import webdriver
3 | from selenium.common.exceptions import TimeoutException
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support.ui import WebDriverWait
6 | from selenium.webdriver.support import expected_conditions as EC
7 | from pyquery import PyQuery as pq
8 | from config import *
9 | import pymongo
10 |
11 | client = pymongo.MongoClient(MONGO_URL)
12 | db = client[MONGO_DB]
13 |
14 | # browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
15 | browser = webdriver.Chrome()
16 | wait = WebDriverWait(browser, 10)
17 |
18 | # browser.set_window_size(1400, 900)
19 |
20 |
21 | def search():
22 | print('正在搜索')
23 | try:
24 | browser.get('https://www.taobao.com')
25 | input = wait.until(
26 | EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
27 | )
28 | submit = wait.until(
29 | EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
30 | input.send_keys(KEYWORD)
31 | submit.click()
32 | total = wait.until(
33 | EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
34 | get_products()
35 | return total.text
36 | except TimeoutException:
37 | return search()
38 |
39 |
40 | def next_page(page_number):
41 | print('正在翻页', page_number)
42 | try:
43 | input = wait.until(
44 | EC.presence_of_element_located(
45 | (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
46 | )
47 | submit = wait.until(EC.element_to_be_clickable(
48 | (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
49 | input.clear()
50 | input.send_keys(page_number)
51 | submit.click()
52 | wait.until(EC.text_to_be_present_in_element(
53 | (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
54 | get_products()
55 | except TimeoutException:
56 | next_page(page_number)
57 |
58 |
59 | def get_products():
60 | wait.until(EC.presence_of_element_located(
61 | (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
62 | html = browser.page_source
63 | doc = pq(html)
64 | items = doc('#mainsrp-itemlist .items .item').items()
65 | for item in items:
66 | product = {
67 | 'image': item.find('.pic .img').attr('src'),
68 | 'price': item.find('.price').text(),
69 | 'deal': item.find('.deal-cnt').text()[:-3],
70 | 'title': item.find('.title').text(),
71 | 'shop': item.find('.shop').text(),
72 | 'location': item.find('.location').text()
73 | }
74 | print(product)
75 | save_to_mongo(product)
76 |
77 |
78 | def save_to_mongo(result):
79 | try:
80 | if db[MONGO_TABLE].insert(result):
81 | print('存储到MONGODB成功', result)
82 | except Exception:
83 | print('存储到MONGODB失败', result)
84 |
85 |
86 | def main():
87 | try:
88 | total = search()
89 | total = int(re.compile('(\d+)').search(total).group(1))
90 | for i in range(2, total + 1):
91 | next_page(i)
92 | except Exception:
93 | print('出错啦')
94 | finally:
95 | browser.close()
96 |
97 |
98 | if __name__ == '__main__':
99 | main()
100 |
--------------------------------------------------------------------------------
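save_to_mongo above relies on Collection.insert(), which is deprecated since pymongo 3.0 and removed in pymongo 4.0. A minimal sketch of the same helper on the newer API, reusing the project's own config.py:

    import pymongo
    from config import MONGO_URL, MONGO_DB, MONGO_TABLE

    client = pymongo.MongoClient(MONGO_URL)
    db = client[MONGO_DB]

    def save_to_mongo(result):
        try:
            db[MONGO_TABLE].insert_one(result)  # non-deprecated single-document insert
            print('存储到MONGODB成功', result)
        except Exception:
            print('存储到MONGODB失败', result)
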
/TouTiao/config.py:
--------------------------------------------------------------------------------
1 | MONGO_URL = 'localhost'
2 | MONGO_DB = 'toutiao'
3 | MONGO_TABLE = 'toutiao'
4 |
5 | GROUP_START = 1
6 | GROUP_END = 10
7 | KEYWORD='街拍'
--------------------------------------------------------------------------------
/TouTiao/spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from urllib.parse import urlencode
4 | import pymongo
5 | import requests
6 | from bs4 import BeautifulSoup
7 | from requests.exceptions import ConnectionError
8 | import re
9 | from multiprocessing import Pool
10 | from hashlib import md5
11 | from json.decoder import JSONDecodeError
12 | from config import *
13 |
14 | client = pymongo.MongoClient(MONGO_URL, connect=False)
15 | db = client[MONGO_DB]
16 |
17 | if not os.path.exists('pics'):
18 | os.mkdir('pics')
19 |
20 |
21 | def get_page_index(offset, keyword):
22 | data = {
23 | 'autoload': 'true',
24 | 'count': 20,
25 | 'cur_tab': 3,
26 | 'format': 'json',
27 | 'keyword': keyword,
28 | 'offset': offset,
29 | }
30 | params = urlencode(data)
31 | base = 'http://www.toutiao.com/search_content/'
32 | url = base + '?' + params
33 | try:
34 | response = requests.get(url)
35 | if response.status_code == 200:
36 | return response.text
37 | return None
38 | except ConnectionError:
39 | print('Error occurred')
40 | return None
41 |
42 |
43 | def download_image(url):
44 | print('Downloading', url)
45 | try:
46 | response = requests.get(url)
47 | if response.status_code == 200:
48 | imgType = response.headers.get('Content-Type')[6:]
49 | if imgType == 'jpeg':
50 | imgType = 'jpg'
51 | save_image(response.content, imgType)
52 | return None
53 | except ConnectionError:
54 | return None
55 |
56 |
57 | def save_image(content, type):
58 | file_path = '{0}/{1}.{2}'.format(os.getcwd() + '/pics',
59 | md5(content).hexdigest(), type)
60 | print(file_path)
61 | if not os.path.exists(file_path):
62 | with open(file_path, 'wb') as f:
63 | f.write(content)
64 | f.close()
65 |
66 |
67 | def parse_page_index(text):
68 | try:
69 | data = json.loads(text)
70 | if data and 'data' in data.keys():
71 | for item in data.get('data'):
72 | yield item.get('article_url')
73 | except JSONDecodeError:
74 | pass
75 |
76 |
77 | def get_page_detail(url):
78 | try:
79 | response = requests.get(url)
80 | if response.status_code == 200:
81 | return response.text
82 | return None
83 | except ConnectionError:
84 | print('Error occurred')
85 | return None
86 |
87 |
88 | def parse_page_detail(html, url):
89 | soup = BeautifulSoup(html, 'lxml')
90 | result = soup.select('title')
91 | title = result[0].get_text() if result else ''
92 | images_pattern = re.compile('gallery: JSON.parse\(\"(.*?)\"\),', re.S)
93 | result = re.search(images_pattern, html)
94 | if result:
95 | data = json.loads(result.group(1).replace(
96 | '\\"', '\"').replace('\\\\', ''))
97 | if data and 'sub_images' in data.keys():
98 | sub_images = data.get('sub_images')
99 | images = [item.get('url') for item in sub_images]
100 | for image in images:
101 | download_image(image)
102 | return {
103 | 'title': title,
104 | 'url': url,
105 | 'images': images
106 | }
107 |
108 |
109 | def save_to_mongo(result):
110 | if db[MONGO_TABLE].insert(result):
111 | print('Successfully Saved to Mongo', result)
112 | return True
113 | return False
114 |
115 |
116 | def main(offset):
117 | text = get_page_index(offset, KEYWORD)
118 | urls = parse_page_index(text)
119 | for url in urls:
120 | html = get_page_detail(url)
121 | result = parse_page_detail(html, url)
122 | if result:
123 | save_to_mongo(result)
124 |
125 |
126 | pool = Pool()
127 | groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
128 | pool.map(main, groups)
129 | pool.close()
130 | pool.join()
131 |
--------------------------------------------------------------------------------
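save_image above names every picture by the MD5 of its raw bytes, so downloading the same image twice never produces a duplicate file; the naming scheme in isolation (the byte string is a stand-in for response.content):

    import os
    from hashlib import md5

    content = b'raw image bytes'                  # stand-in for response.content
    name = md5(content).hexdigest()               # identical bytes always map to the same name
    file_path = '{0}/{1}.{2}'.format(os.getcwd() + '/pics', name, 'jpg')
    print(file_path)
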
/Weixin/config.py:
--------------------------------------------------------------------------------
1 | PROXY_POOL_URL = 'http://127.0.0.1:5000/get'
2 | KEYWORD = 'Python'
3 | MONGO_URI = 'localhost'
4 | MONGO_DB = 'weixin'
5 | MAX_COUNT = 5
--------------------------------------------------------------------------------
/Weixin/spider.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urlencode
2 | import pymongo
3 | import requests
4 | from lxml.etree import XMLSyntaxError
5 | from requests.exceptions import ConnectionError
6 | from pyquery import PyQuery as pq
7 | from config import *
8 |
9 | client = pymongo.MongoClient(MONGO_URI)
10 | db = client[MONGO_DB]
11 |
12 | base_url = 'http://weixin.sogou.com/weixin?'
13 |
14 | headers = {
15 | 'Cookie': 'SUV=1509129808368866; SMYUV=1509129808395088; ABTEST=0|1511965164|v1; SNUID=48EF0F91E5E0BBFA36750514E5021D03; SUID=AC0BEB744842910A000000005A1EC1EC; SUID=AC0BEB745018910A000000005A1EC1ED; weixinIndexVisited=1; IPLOC=CN3100; JSESSIONID=aaaxgt9qdt-zzePJ7Pv8v; ppinf=5|1511965755|1513175355|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo3OlZhbXBpcmV8Y3J0OjEwOjE1MTE5NjU3NTV8cmVmbmljazo3OlZhbXBpcmV8dXNlcmlkOjQ0Om85dDJsdUExVEZaUHQwcGN6ZVo3cU54RDV4MDhAd2VpeGluLnNvaHUuY29tfA; pprdig=c6yA_9zg5lQZIzDaGlJmPdRCP6fkQ-FcEBHcjtOkZLhPHbF7ld-lilOnLnyR6Xu9cjbAJUfdsYIYI2pvMBUdahClUcpxYJK46PqJZ1WgU1nA7BR1IRwvjqnpCKdzAT5WkmlKLXi2L9XEKcZF7ItOHVy0mLb-yIZ9nU2OlWr4CMc; sgid=02-30131415-AVoexDskULls7k9ia1Km7RuA; ppmdig=15119657560000008c4babee24ee063e8afb031f772b2169; sct=4',
16 | 'Host': 'weixin.sogou.com',
17 | 'Upgrade-Insecure-Requests': '1',
18 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
19 | }
20 |
21 | proxy = None
22 |
23 |
24 | def get_proxy():
25 | try:
26 | response = requests.get(PROXY_POOL_URL)
27 | if response.status_code == 200:
28 | return response.text
29 | return None
30 | except ConnectionError:
31 | return None
32 |
33 | def get_html(url, count=1):
34 | print('Crawling', url)
35 | print('Trying Count', count)
36 | global proxy
37 | if count >= MAX_COUNT:
38 | print('Tried Too Many Counts')
39 | return None
40 | try:
41 | if proxy:
42 | proxies = {
43 | 'http': 'http://' + proxy
44 | }
45 | response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
46 | else:
47 | response = requests.get(url, allow_redirects=False, headers=headers)
48 | if response.status_code == 200:
49 | return response.text
50 | if response.status_code == 302:
51 | # Need Proxy
52 | print('302')
53 | proxy = get_proxy()
54 | if proxy:
55 | print('Using Proxy', proxy)
56 | return get_html(url)
57 | else:
58 | print('Get Proxy Failed')
59 | return None
60 | except ConnectionError as e:
61 | print('Error Occurred', e.args)
62 | proxy = get_proxy()
63 | count += 1
64 | return get_html(url, count)
65 |
66 |
67 |
68 | def get_index(keyword, page):
69 | data = {
70 | 'query': keyword,
71 | 'type': 2,
72 | 'page': page
73 | }
74 | queries = urlencode(data)
75 | url = base_url + queries
76 | html = get_html(url)
77 | return html
78 |
79 | def parse_index(html):
80 | doc = pq(html)
81 | items = doc('.news-box .news-list li .txt-box h3 a').items()
82 | for item in items:
83 | yield item.attr('href')
84 |
85 | def get_detail(url):
86 | try:
87 | response = requests.get(url)
88 | if response.status_code == 200:
89 | return response.text
90 | return None
91 | except ConnectionError:
92 | return None
93 |
94 | def parse_detail(html):
95 | try:
96 | doc = pq(html)
97 | title = doc('.rich_media_title').text()
98 | content = doc('.rich_media_content').text()
99 | date = doc('#post-date').text()
100 | nickname = doc('#js_profile_qrcode > div > strong').text()
101 | wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
102 | return {
103 | 'title': title,
104 | 'content': content,
105 | 'date': date,
106 | 'nickname': nickname,
107 | 'wechat': wechat
108 | }
109 | except XMLSyntaxError:
110 | return None
111 |
112 | def save_to_mongo(data):
113 | if db['articles'].update({'title': data['title']}, {'$set': data}, True):
114 | print('Saved to Mongo', data['title'])
115 | else:
116 | print('Saved to Mongo Failed', data['title'])
117 |
118 |
119 | def main():
120 | for page in range(1, 101):
121 | html = get_index(KEYWORD, page)
122 | if html:
123 | article_urls = parse_index(html)
124 | for article_url in article_urls:
125 | article_html = get_detail(article_url)
126 | if article_html:
127 | article_data = parse_detail(article_html)
128 | print(article_data)
129 | if article_data:
130 | save_to_mongo(article_data)
131 |
132 |
133 |
134 | if __name__ == '__main__':
135 | main()
--------------------------------------------------------------------------------
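save_to_mongo above upserts with Collection.update(filter, {'$set': ...}, True), a call that is deprecated in pymongo 3 and removed in pymongo 4. A sketch of the equivalent with update_one, reusing MONGO_URI and MONGO_DB from config.py:

    import pymongo
    from config import MONGO_URI, MONGO_DB

    client = pymongo.MongoClient(MONGO_URI)
    db = client[MONGO_DB]

    def save_to_mongo(data):
        # upsert=True inserts the article when no document with this title exists yet
        db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True)
        print('Saved to Mongo', data['title'])
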
/Zhihu/Zhihu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CP-0/py_spider_codes/d7875b33c21d06b3a9af952248bf9f3acd50db93/Zhihu/Zhihu/__init__.py
--------------------------------------------------------------------------------
/Zhihu/Zhihu/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item, Field
9 |
10 |
11 | class UserItem(Item):
12 | # define the fields for your item here like:
13 | id = Field()
14 | name = Field()
15 | avatar_url = Field()
16 | headline = Field()
17 | description = Field()
18 | url = Field()
19 | url_token = Field()
20 | gender = Field()
21 | cover_url = Field()
22 | type = Field()
23 | badge = Field()
24 |
25 | answer_count = Field()
26 | articles_count = Field()
27 | commercial_question_count = Field()
28 | favorite_count = Field()
29 | favorited_count = Field()
30 | follower_count = Field()
31 | following_columns_count = Field()
32 | following_count = Field()
33 | pins_count = Field()
34 | question_count = Field()
35 | thank_from_count = Field()
36 | thank_to_count = Field()
37 | thanked_count = Field()
38 | vote_from_count = Field()
39 | vote_to_count = Field()
40 | voteup_count = Field()
41 | following_favlists_count = Field()
42 | following_question_count = Field()
43 | following_topic_count = Field()
44 | marked_answers_count = Field()
45 | mutual_followees_count = Field()
46 | hosted_live_count = Field()
47 | participated_live_count = Field()
48 |
49 | locations = Field()
50 | educations = Field()
51 | employments = Field()
--------------------------------------------------------------------------------
/Zhihu/Zhihu/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class ZhihuSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/Zhihu/Zhihu/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 |
9 |
10 | class ZhihuPipeline(object):
11 | def process_item(self, item, spider):
12 | return item
13 |
14 |
15 | class MongoPipeline(object):
16 | collection_name = 'users'
17 |
18 | def __init__(self, mongo_uri, mongo_db):
19 | self.mongo_uri = mongo_uri
20 | self.mongo_db = mongo_db
21 |
22 | @classmethod
23 | def from_crawler(cls, crawler):
24 | return cls(
25 | mongo_uri=crawler.settings.get('MONGO_URI'),
26 | mongo_db=crawler.settings.get('MONGO_DATABASE')
27 | )
28 |
29 | def open_spider(self, spider):
30 | self.client = pymongo.MongoClient(self.mongo_uri)
31 | self.db = self.client[self.mongo_db]
32 |
33 | def close_spider(self, spider):
34 | self.client.close()
35 |
36 | def process_item(self, item, spider):
37 | self.db[self.collection_name].update({'url_token': item['url_token']}, dict(item), True)
38 | return item
39 |
--------------------------------------------------------------------------------
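MongoPipeline.process_item above performs a whole-document upsert through the deprecated Collection.update(). On pymongo 3+/4 the same behaviour is spelled replace_one with upsert=True; a drop-in sketch of that method for the class above:

    def process_item(self, item, spider):
        # Replace (or insert) the stored user document, keyed by url_token
        self.db[self.collection_name].replace_one(
            {'url_token': item['url_token']}, dict(item), upsert=True)
        return item
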
/Zhihu/Zhihu/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for Zhihu project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'Zhihu'
13 |
14 | SPIDER_MODULES = ['Zhihu.spiders']
15 | NEWSPIDER_MODULE = 'Zhihu.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'Zhihu (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | # DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
45 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
46 | }
47 |
48 | # Enable or disable spider middlewares
49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
50 | # SPIDER_MIDDLEWARES = {
51 | # 'Zhihu.middlewares.ZhihuSpiderMiddleware': 543,
52 | # }
53 |
54 | # Enable or disable downloader middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
56 | # DOWNLOADER_MIDDLEWARES = {
57 | # 'Zhihu.middlewares.MyCustomDownloaderMiddleware': 543,
58 | # }
59 |
60 | # Enable or disable extensions
61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
62 | # EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | # }
65 |
66 | # Configure item pipelines
67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | 'Zhihu.pipelines.MongoPipeline': 300,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | # AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | # AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | # AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | # AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | # HTTPCACHE_ENABLED = True
88 | # HTTPCACHE_EXPIRATION_SECS = 0
89 | # HTTPCACHE_DIR = 'httpcache'
90 | # HTTPCACHE_IGNORE_HTTP_CODES = []
91 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
93 | MONGO_URI = 'localhost'
94 | MONGO_DATABASE = 'zhihu'
--------------------------------------------------------------------------------
/Zhihu/Zhihu/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/Zhihu/Zhihu/spiders/zhihu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 |
4 | from scrapy import Request, Spider
5 |
6 | from Zhihu.items import UserItem
7 |
8 |
9 | class ZhihuSpider(Spider):
10 | name = 'zhihu'
11 | allowed_domains = ['www.zhihu.com']
12 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
13 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
14 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
15 | start_user = 'excited-vczh'
16 | user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
17 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
18 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
19 |
20 | def start_requests(self):
21 | yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)
22 | yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0),
23 | self.parse_follows)
24 | yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0),
25 | self.parse_followers)
26 |
27 | def parse_user(self, response):
28 | result = json.loads(response.text)
29 | item = UserItem()
30 |
31 | for field in item.fields:
32 | if field in result.keys():
33 | item[field] = result.get(field)
34 | yield item
35 |
36 | yield Request(
37 | self.follows_url.format(user=result.get(
38 | 'url_token'), include=self.follows_query, limit=20, offset=0),
39 | self.parse_follows)
40 |
41 | yield Request(
42 | self.followers_url.format(user=result.get(
43 | 'url_token'), include=self.followers_query, limit=20, offset=0),
44 | self.parse_followers)
45 |
46 | def parse_follows(self, response):
47 | results = json.loads(response.text)
48 |
49 | if 'data' in results.keys():
50 | for result in results.get('data'):
51 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
52 | self.parse_user)
53 |
54 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
55 | next_page = results.get('paging').get('next')
56 | yield Request(next_page, self.parse_follows)
57 |
58 | def parse_followers(self, response):
59 | results = json.loads(response.text)
60 |
61 | if 'data' in results.keys():
62 | for result in results.get('data'):
63 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
64 | self.parse_user)
65 |
66 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
67 | next_page = results.get('paging').get('next')
68 | yield Request(next_page, self.parse_followers)
69 |
--------------------------------------------------------------------------------
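The spider above builds every API request by plain string formatting; for reference, this is roughly what the first user request looks like for the seed account (the include list is shortened here, the spider sends the full user_query):

    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    include = 'answer_count,follower_count,gender,description'  # shortened for readability
    print(user_url.format(user='excited-vczh', include=include))
    # https://www.zhihu.com/api/v4/members/excited-vczh?include=answer_count,follower_count,gender,description
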
/Zhihu/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = Zhihu.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = Zhihu
12 |
--------------------------------------------------------------------------------