├── .gitignore ├── BaiduStocks ├── BaiduStocks │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── stock.py └── scrapy.cfg ├── CrawBaiduStocksA.py ├── CrawTaobaoPrice.py ├── CrawUnivRanking.py ├── MaoYan └── spider.py ├── TaobaoProduct ├── config.py └── spider.py ├── TouTiao ├── config.py └── spider.py ├── Weixin ├── config.py └── spider.py └── Zhihu ├── Zhihu ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── zhihu.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | BaiduStockInfo.txt 2 | .vscode 3 | .idea 4 | __pycache__ 5 | pics -------------------------------------------------------------------------------- /BaiduStocks/BaiduStocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CP-0/py_spider_codes/d7875b33c21d06b3a9af952248bf9f3acd50db93/BaiduStocks/BaiduStocks/__init__.py -------------------------------------------------------------------------------- /BaiduStocks/BaiduStocks/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaidustocksItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /BaiduStocks/BaiduStocks/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BaidustocksSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 
50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /BaiduStocks/BaiduStocks/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BaidustocksPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | 13 | 14 | class BaidustocksInfoPipeline(object): 15 | def open_spider(self, spider): 16 | self.f = open('BaiduStockInfo.txt', 'w') 17 | 18 | def close_spider(self, spider): 19 | self.f.close() 20 | 21 | def process_item(self, item, spider): 22 | try: 23 | line = str(dict(item)) + '\n' 24 | self.f.write(line) 25 | except: 26 | pass 27 | return item 28 | -------------------------------------------------------------------------------- /BaiduStocks/BaiduStocks/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for BaiduStocks project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'BaiduStocks' 13 | 14 | SPIDER_MODULES = ['BaiduStocks.spiders'] 15 | NEWSPIDER_MODULE = 'BaiduStocks.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'BaiduStocks (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'BaiduStocks.middlewares.BaidustocksSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'BaiduStocks.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See 
http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /BaiduStocks/BaiduStocks/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /BaiduStocks/BaiduStocks/spiders/stock.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | 5 | 6 | class StockSpider(scrapy.Spider): 7 | name = 'stock' 8 | start_urls = ['http://quote.eastmoney.com/stocklist.html'] 9 | 10 | def parse(self, response): 11 | for href in response.css('a::attr(href)').extract(): 12 | try: 13 | stock = re.findall(r"[s][hz]\d{6}", href)[0] 14 | url = 'https://gupiao.baidu.com/stock/' + stock + '.html' 15 | yield scrapy.Request(url, callback=self.parse_stock) 16 | except: 17 | continue 18 | 19 | def parse_stock(self, response): 20 | infoDict = {} 21 | stockInfo = response.css('.stock-bets') 22 | name = stockInfo.css('.bets-name').extract()[0] 23 | infoDict.update( 24 | {'股票名称': re.findall(r'\s.*\(', name)[0].split()[0] + 25 | re.findall(r'\>.*\<', name)[0][1:-1]}) 26 | 27 | keyList = stockInfo.css('dt').extract() 28 | valueList = stockInfo.css('dd').extract() 29 | for i in range(len(keyList)): 30 | key = re.findall(r'>.*</dt>', keyList[i])[0][1:-5] 31 | try: 32 | val = re.findall(r'\d+\.?.*</dd>', valueList[i])[0][0:-5] 33 | except: 34 | val = '--' 35 | infoDict[key] = val 36 | 37 | yield infoDict 38 | -------------------------------------------------------------------------------- /BaiduStocks/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = BaiduStocks.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project =
BaiduStocks 12 | -------------------------------------------------------------------------------- /CrawBaiduStocksA.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import traceback 5 | 6 | 7 | def getHTMLText(url, code='utf-8'): 8 | try: 9 | r = requests.get(url) 10 | r.raise_for_status() 11 | r.encoding = code 12 | return r.text 13 | except: 14 | return '' 15 | 16 | 17 | def getStockList(lst, stockURL): 18 | html = getHTMLText(stockURL, 'GB2312') 19 | soup = BeautifulSoup(html, 'html.parser') 20 | a = soup.find_all('a') 21 | for i in a: 22 | try: 23 | href = i.attrs['href'] 24 | lst.append(re.findall(r'[s][hz]\d{6}', href)[0]) 25 | except: 26 | continue 27 | 28 | 29 | def getStockInfo(lst, stockURL, fpath): 30 | count = 0 31 | for stock in lst: 32 | url = stockURL + stock + '.html' 33 | html = getHTMLText(url) 34 | try: 35 | if html == '': 36 | continue 37 | infoDict = {} 38 | soup = BeautifulSoup(html, 'html.parser') 39 | stockInfo = soup.find('div', attrs={'class': 'stock-bets'}) 40 | 41 | name = stockInfo.find_all(attrs={'class': 'bets-name'})[0] 42 | infoDict.update({'股票名称': name.text.split()[0]}) 43 | 44 | keyList = stockInfo.find_all('dt') 45 | valueList = stockInfo.find_all('dd') 46 | for i in range(len(keyList)): 47 | key = keyList[i].text 48 | val = valueList[i].text 49 | infoDict[key] = val 50 | 51 | with open(fpath, 'a', encoding='utf-8') as f: 52 | f.write(str(infoDict) + '\n') 53 | count = count + 1 54 | print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end="") 55 | except: 56 | count = count + 1 57 | print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end="") 58 | continue 59 | 60 | 61 | def main(): 62 | stock_list_url = 'http://quote.eastmoney.com/stocklist.html' 63 | stock_info_url = 'https://gupiao.baidu.com/stock/' 64 | output_file = './BaiduStockInfo.txt' 65 | slist = [] 66 | getStockList(slist, stock_list_url) 67 | getStockInfo(slist, stock_info_url, output_file) 68 | 69 | 70 | main() 71 | -------------------------------------------------------------------------------- /CrawTaobaoPrice.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | 4 | 5 | def getHTMLText(url): 6 | try: 7 | r = requests.get(url, timeout=30) 8 | r.raise_for_status() 9 | r.encoding = r.apparent_encoding 10 | return r.text 11 | except: 12 | return '' 13 | 14 | 15 | def parsePage(ilt, html): 16 | try: 17 | plt = re.findall(r'\"price\"\:\"[\d.]*\"', html) 18 | tlt = re.findall(r'\"title\"\:\".*?\"', html) 19 | tilt = re.findall(r'\"tag_info\"\:.*?]{1}', html) 20 | for i in range(len(plt)): 21 | price = "¥" + eval(plt[i].split(':')[1]) 22 | title = eval(tlt[i].split(':')[1]) 23 | tglt = re.findall(r'\"tag\"\:\"(.*?)\"', tilt[i]) 24 | feature = ', '.join(tglt) 25 | ilt.append([price, title, feature]) 26 | except: 27 | print('') 28 | 29 | 30 | def printGoodsList(ilt): 31 | tplt = "{:4}\t{:8}\t{:16}\t{:16}" 32 | print(tplt.format("序号", "价格", "商品名称", "特色")) 33 | count = 0 34 | for j in ilt: 35 | count = count + 1 36 | print(tplt.format(count, j[0], j[1], j[2])) 37 | 38 | 39 | def main(): 40 | goods = "手机" 41 | depth = 2 42 | start_url = 'https://s.taobao.com/search?q=' + goods 43 | infoList = [] 44 | for i in range(depth): 45 | try: 46 | url = start_url + "&s=" + str(i * 48) 47 | html = getHTMLText(url) 48 | parsePage(infoList, html) 49 | except: 50 | continue 51 | printGoodsList(infoList) 52 | 53 | 54 | main() 55 | 
-------------------------------------------------------------------------------- /CrawUnivRanking.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import bs4 4 | 5 | 6 | def getHTMLText(url): 7 | try: 8 | r = requests.get(url) 9 | r.raise_for_status() 10 | r.encoding = r.apparent_encoding 11 | return r.text 12 | except: 13 | return '' 14 | 15 | 16 | def fillUnivList(ulist, html): 17 | soup = BeautifulSoup(html, 'html.parser') 18 | for tr in soup.find('tbody').children: 19 | if isinstance(tr, bs4.element.Tag): 20 | tds = tr('td') 21 | ulist.append([tds[0].string, tds[1].string, tds[2].string]) 22 | 23 | 24 | def printUnivList(ulist, num): 25 | tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}" 26 | print(tplt.format('排名', '学校', '地区', chr(12288))) 27 | for i in range(num): 28 | u = ulist[i] 29 | print(tplt.format(u[0], u[1], u[2], chr(12288))) 30 | 31 | 32 | def main(): 33 | uinfo = [] 34 | url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html' 35 | html = getHTMLText(url) 36 | fillUnivList(uinfo, html) 37 | printUnivList(uinfo, 20) 38 | 39 | 40 | main() 41 | -------------------------------------------------------------------------------- /MaoYan/spider.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import re 3 | import json 4 | import requests 5 | from requests.exceptions import RequestException 6 | 7 | 8 | def get_one_page(url): 9 | try: 10 | headers = {'user-agent': 'Mozilla/5.0'} 11 | response = requests.get(url, headers=headers) 12 | if response.status_code == 200: 13 | return response.text 14 | return None 15 | except RequestException: 16 | return None 17 | 18 | 19 | def parse_one_page(html): 20 | pattern = re.compile('
<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a' 21 | + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>' 22 | + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>
', re.S) 23 | items = re.findall(pattern, html) 24 | for item in items: 25 | yield { 26 | 'index': item[0], 27 | 'image': item[1], 28 | 'title': item[2], 29 | 'actor': item[3].strip()[3:], 30 | 'time': item[4].strip()[5:], 31 | 'score': item[5] + item[6] 32 | } 33 | 34 | 35 | def write_to_file(content): 36 | with open('result.txt', 'a', encoding='utf-8') as f: 37 | f.write(json.dumps(content, ensure_ascii=False) + '\n') 38 | f.close() 39 | 40 | 41 | def main(offset): 42 | url = 'http://maoyan.com/board/4?offset=' + str(offset) 43 | html = get_one_page(url) 44 | for item in parse_one_page(html): 45 | print(item) 46 | write_to_file(item) 47 | 48 | 49 | if __name__ == '__main__': 50 | pool = Pool() 51 | pool.map(main, [i * 10 for i in range(10)]) 52 | pool.close() 53 | pool.join() 54 | -------------------------------------------------------------------------------- /TaobaoProduct/config.py: -------------------------------------------------------------------------------- 1 | MONGO_URL = 'localhost' 2 | MONGO_DB = 'taobao' 3 | MONGO_TABLE = 'product' 4 | 5 | SERVICE_ARGS = ['--load-images=false', '--disk-cache=true'] 6 | 7 | KEYWORD = '美食' -------------------------------------------------------------------------------- /TaobaoProduct/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | from selenium import webdriver 3 | from selenium.common.exceptions import TimeoutException 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support.ui import WebDriverWait 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from pyquery import PyQuery as pq 8 | from config import * 9 | import pymongo 10 | 11 | client = pymongo.MongoClient(MONGO_URL) 12 | db = client[MONGO_DB] 13 | 14 | # browser = webdriver.PhantomJS(service_args=SERVICE_ARGS) 15 | browser = webdriver.Chrome() 16 | wait = WebDriverWait(browser, 10) 17 | 18 | # browser.set_window_size(1400, 900) 19 | 20 | 21 | def search(): 22 | print('正在搜索') 23 | try: 24 | browser.get('https://www.taobao.com') 25 | input = wait.until( 26 | EC.presence_of_element_located((By.CSS_SELECTOR, '#q')) 27 | ) 28 | submit = wait.until( 29 | EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button'))) 30 | input.send_keys(KEYWORD) 31 | submit.click() 32 | total = wait.until( 33 | EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))) 34 | get_products() 35 | return total.text 36 | except TimeoutException: 37 | return search() 38 | 39 | 40 | def next_page(page_number): 41 | print('正在翻页', page_number) 42 | try: 43 | input = wait.until( 44 | EC.presence_of_element_located( 45 | (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')) 46 | ) 47 | submit = wait.until(EC.element_to_be_clickable( 48 | (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))) 49 | input.clear() 50 | input.send_keys(page_number) 51 | submit.click() 52 | wait.until(EC.text_to_be_present_in_element( 53 | (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number))) 54 | get_products() 55 | except TimeoutException: 56 | next_page(page_number) 57 | 58 | 59 | def get_products(): 60 | wait.until(EC.presence_of_element_located( 61 | (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item'))) 62 | html = browser.page_source 63 | doc = pq(html) 64 | items = doc('#mainsrp-itemlist .items .item').items() 65 | for item in items: 66 | product = { 67 | 
'image': item.find('.pic .img').attr('src'), 68 | 'price': item.find('.price').text(), 69 | 'deal': item.find('.deal-cnt').text()[:-3], 70 | 'title': item.find('.title').text(), 71 | 'shop': item.find('.shop').text(), 72 | 'location': item.find('.location').text() 73 | } 74 | print(product) 75 | save_to_mongo(product) 76 | 77 | 78 | def save_to_mongo(result): 79 | try: 80 | if db[MONGO_TABLE].insert(result): 81 | print('存储到MONGODB成功', result) 82 | except Exception: 83 | print('存储到MONGODB失败', result) 84 | 85 | 86 | def main(): 87 | try: 88 | total = search() 89 | total = int(re.compile('(\d+)').search(total).group(1)) 90 | for i in range(2, total + 1): 91 | next_page(i) 92 | except Exception: 93 | print('出错啦') 94 | finally: 95 | browser.close() 96 | 97 | 98 | if __name__ == '__main__': 99 | main() 100 | -------------------------------------------------------------------------------- /TouTiao/config.py: -------------------------------------------------------------------------------- 1 | MONGO_URL = 'localhost' 2 | MONGO_DB = 'toutiao' 3 | MONGO_TABLE = 'toutiao' 4 | 5 | GROUP_START = 1 6 | GROUP_END = 10 7 | KEYWORD='街拍' -------------------------------------------------------------------------------- /TouTiao/spider.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from urllib.parse import urlencode 4 | import pymongo 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from requests.exceptions import ConnectionError 8 | import re 9 | from multiprocessing import Pool 10 | from hashlib import md5 11 | from json.decoder import JSONDecodeError 12 | from config import * 13 | 14 | client = pymongo.MongoClient(MONGO_URL, connect=False) 15 | db = client[MONGO_DB] 16 | 17 | if not os.path.exists('pics'): 18 | os.mkdir('pics') 19 | 20 | 21 | def get_page_index(offset, keyword): 22 | data = { 23 | 'autoload': 'true', 24 | 'count': 20, 25 | 'cur_tab': 3, 26 | 'format': 'json', 27 | 'keyword': keyword, 28 | 'offset': offset, 29 | } 30 | params = urlencode(data) 31 | base = 'http://www.toutiao.com/search_content/' 32 | url = base + '?' 
+ params 33 | try: 34 | response = requests.get(url) 35 | if response.status_code == 200: 36 | return response.text 37 | return None 38 | except ConnectionError: 39 | print('Error occurred') 40 | return None 41 | 42 | 43 | def download_image(url): 44 | print('Downloading', url) 45 | try: 46 | response = requests.get(url) 47 | if response.status_code == 200: 48 | imgType = response.headers.get('Content-Type')[6:] 49 | if imgType == 'jpeg': 50 | imgType = 'jpg' 51 | save_image(response.content, imgType) 52 | return None 53 | except ConnectionError: 54 | return None 55 | 56 | 57 | def save_image(content, type): 58 | file_path = '{0}/{1}.{2}'.format(os.getcwd() + '/pics', 59 | md5(content).hexdigest(), type) 60 | print(file_path) 61 | if not os.path.exists(file_path): 62 | with open(file_path, 'wb') as f: 63 | f.write(content) 64 | f.close() 65 | 66 | 67 | def parse_page_index(text): 68 | try: 69 | data = json.loads(text) 70 | if data and 'data' in data.keys(): 71 | for item in data.get('data'): 72 | yield item.get('article_url') 73 | except JSONDecodeError: 74 | pass 75 | 76 | 77 | def get_page_detail(url): 78 | try: 79 | response = requests.get(url) 80 | if response.status_code == 200: 81 | return response.text 82 | return None 83 | except ConnectionError: 84 | print('Error occurred') 85 | return None 86 | 87 | 88 | def parse_page_detail(html, url): 89 | soup = BeautifulSoup(html, 'lxml') 90 | result = soup.select('title') 91 | title = result[0].get_text() if result else '' 92 | images_pattern = re.compile('gallery: JSON.parse\(\"(.*?)\"\),', re.S) 93 | result = re.search(images_pattern, html) 94 | if result: 95 | data = json.loads(result.group(1).replace( 96 | '\\"', '\"').replace('\\\\', '')) 97 | if data and 'sub_images' in data.keys(): 98 | sub_images = data.get('sub_images') 99 | images = [item.get('url') for item in sub_images] 100 | for image in images: 101 | download_image(image) 102 | return { 103 | 'title': title, 104 | 'url': url, 105 | 'images': images 106 | } 107 | 108 | 109 | def save_to_mongo(result): 110 | if db[MONGO_TABLE].insert(result): 111 | print('Successfully Saved to Mongo', result) 112 | return True 113 | return False 114 | 115 | 116 | def main(offset): 117 | text = get_page_index(offset, KEYWORD) 118 | urls = parse_page_index(text) 119 | for url in urls: 120 | html = get_page_detail(url) 121 | result = parse_page_detail(html, url) 122 | if result: 123 | save_to_mongo(result) 124 | 125 | 126 | pool = Pool() 127 | groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)]) 128 | pool.map(main, groups) 129 | pool.close() 130 | pool.join() 131 | -------------------------------------------------------------------------------- /Weixin/config.py: -------------------------------------------------------------------------------- 1 | PROXY_POOL_URL = 'http://127.0.0.1:5000/get' 2 | KEYWORD = 'Python' 3 | MONGO_URI = 'localhost' 4 | MONGO_DB = 'weixin' 5 | MAX_COUNT = 5 -------------------------------------------------------------------------------- /Weixin/spider.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlencode 2 | import pymongo 3 | import requests 4 | from lxml.etree import XMLSyntaxError 5 | from requests.exceptions import ConnectionError 6 | from pyquery import PyQuery as pq 7 | from config import * 8 | 9 | client = pymongo.MongoClient(MONGO_URI) 10 | db = client[MONGO_DB] 11 | 12 | base_url = 'http://weixin.sogou.com/weixin?' 
13 | 14 | headers = { 15 | 'Cookie': 'SUV=1509129808368866; SMYUV=1509129808395088; ABTEST=0|1511965164|v1; SNUID=48EF0F91E5E0BBFA36750514E5021D03; SUID=AC0BEB744842910A000000005A1EC1EC; SUID=AC0BEB745018910A000000005A1EC1ED; weixinIndexVisited=1; IPLOC=CN3100; JSESSIONID=aaaxgt9qdt-zzePJ7Pv8v; ppinf=5|1511965755|1513175355|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo3OlZhbXBpcmV8Y3J0OjEwOjE1MTE5NjU3NTV8cmVmbmljazo3OlZhbXBpcmV8dXNlcmlkOjQ0Om85dDJsdUExVEZaUHQwcGN6ZVo3cU54RDV4MDhAd2VpeGluLnNvaHUuY29tfA; pprdig=c6yA_9zg5lQZIzDaGlJmPdRCP6fkQ-FcEBHcjtOkZLhPHbF7ld-lilOnLnyR6Xu9cjbAJUfdsYIYI2pvMBUdahClUcpxYJK46PqJZ1WgU1nA7BR1IRwvjqnpCKdzAT5WkmlKLXi2L9XEKcZF7ItOHVy0mLb-yIZ9nU2OlWr4CMc; sgid=02-30131415-AVoexDskULls7k9ia1Km7RuA; ppmdig=15119657560000008c4babee24ee063e8afb031f772b2169; sct=4', 16 | 'Host': 'weixin.sogou.com', 17 | 'Upgrade-Insecure-Requests': '1', 18 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36' 19 | } 20 | 21 | proxy = None 22 | 23 | 24 | def get_proxy(): 25 | try: 26 | response = requests.get(PROXY_POOL_URL) 27 | if response.status_code == 200: 28 | return response.text 29 | return None 30 | except ConnectionError: 31 | return None 32 | 33 | def get_html(url, count=1): 34 | print('Crawling', url) 35 | print('Trying Count', count) 36 | global proxy 37 | if count >= MAX_COUNT: 38 | print('Tried Too Many Counts') 39 | return None 40 | try: 41 | if proxy: 42 | proxies = { 43 | 'http': 'http://' + proxy 44 | } 45 | response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies) 46 | else: 47 | response = requests.get(url, allow_redirects=False, headers=headers) 48 | if response.status_code == 200: 49 | return response.text 50 | if response.status_code == 302: 51 | # Need Proxy 52 | print('302') 53 | proxy = get_proxy() 54 | if proxy: 55 | print('Using Proxy', proxy) 56 | return get_html(url) 57 | else: 58 | print('Get Proxy Failed') 59 | return None 60 | except ConnectionError as e: 61 | print('Error Occurred', e.args) 62 | proxy = get_proxy() 63 | count += 1 64 | return get_html(url, count) 65 | 66 | 67 | 68 | def get_index(keyword, page): 69 | data = { 70 | 'query': keyword, 71 | 'type': 2, 72 | 'page': page 73 | } 74 | queries = urlencode(data) 75 | url = base_url + queries 76 | html = get_html(url) 77 | return html 78 | 79 | def parse_index(html): 80 | doc = pq(html) 81 | items = doc('.news-box .news-list li .txt-box h3 a').items() 82 | for item in items: 83 | yield item.attr('href') 84 | 85 | def get_detail(url): 86 | try: 87 | response = requests.get(url) 88 | if response.status_code == 200: 89 | return response.text 90 | return None 91 | except ConnectionError: 92 | return None 93 | 94 | def parse_detail(html): 95 | try: 96 | doc = pq(html) 97 | title = doc('.rich_media_title').text() 98 | content = doc('.rich_media_content').text() 99 | date = doc('#post-date').text() 100 | nickname = doc('#js_profile_qrcode > div > strong').text() 101 | wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text() 102 | return { 103 | 'title': title, 104 | 'content': content, 105 | 'date': date, 106 | 'nickname': nickname, 107 | 'wechat': wechat 108 | } 109 | except XMLSyntaxError: 110 | return None 111 | 112 | def save_to_mongo(data): 113 | if db['articles'].update({'title': data['title']}, {'$set': data}, True): 114 | print('Saved to Mongo', data['title']) 115 | else: 116 | print('Saved to Mongo Failed', data['title']) 117 | 118 | 119 | def main(): 120 | for page 
in range(1, 101): 121 | html = get_index(KEYWORD, page) 122 | if html: 123 | article_urls = parse_index(html) 124 | for article_url in article_urls: 125 | article_html = get_detail(article_url) 126 | if article_html: 127 | article_data = parse_detail(article_html) 128 | print(article_data) 129 | if article_data: 130 | save_to_mongo(article_data) 131 | 132 | 133 | 134 | if __name__ == '__main__': 135 | main() -------------------------------------------------------------------------------- /Zhihu/Zhihu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CP-0/py_spider_codes/d7875b33c21d06b3a9af952248bf9f3acd50db93/Zhihu/Zhihu/__init__.py -------------------------------------------------------------------------------- /Zhihu/Zhihu/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | 11 | class UserItem(Item): 12 | # define the fields for your item here like: 13 | id = Field() 14 | name = Field() 15 | avatar_url = Field() 16 | headline = Field() 17 | description = Field() 18 | url = Field() 19 | url_token = Field() 20 | gender = Field() 21 | cover_url = Field() 22 | type = Field() 23 | badge = Field() 24 | 25 | answer_count = Field() 26 | articles_count = Field() 27 | commercial_question_count = Field() 28 | favorite_count = Field() 29 | favorited_count = Field() 30 | follower_count = Field() 31 | following_columns_count = Field() 32 | following_count = Field() 33 | pins_count = Field() 34 | question_count = Field() 35 | thank_from_count = Field() 36 | thank_to_count = Field() 37 | thanked_count = Field() 38 | vote_from_count = Field() 39 | vote_to_count = Field() 40 | voteup_count = Field() 41 | following_favlists_count = Field() 42 | following_question_count = Field() 43 | following_topic_count = Field() 44 | marked_answers_count = Field() 45 | mutual_followees_count = Field() 46 | hosted_live_count = Field() 47 | participated_live_count = Field() 48 | 49 | locations = Field() 50 | educations = Field() 51 | employments = Field() -------------------------------------------------------------------------------- /Zhihu/Zhihu/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZhihuSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 
33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Zhihu/Zhihu/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | 10 | class ZhihuPipeline(object): 11 | def process_item(self, item, spider): 12 | return item 13 | 14 | 15 | class MongoPipeline(object): 16 | collection_name = 'users' 17 | 18 | def __init__(self, mongo_uri, mongo_db): 19 | self.mongo_uri = mongo_uri 20 | self.mongo_db = mongo_db 21 | 22 | @classmethod 23 | def from_crawler(cls, crawler): 24 | return cls( 25 | mongo_uri=crawler.settings.get('MONGO_URI'), 26 | mongo_db=crawler.settings.get('MONGO_DATABASE') 27 | ) 28 | 29 | def open_spider(self, spider): 30 | self.client = pymongo.MongoClient(self.mongo_uri) 31 | self.db = self.client[self.mongo_db] 32 | 33 | def close_spider(self, spider): 34 | self.client.close() 35 | 36 | def process_item(self, item, spider): 37 | self.db[self.collection_name].update({'url_token': item['url_token']}, dict(item), True) 38 | return item 39 | -------------------------------------------------------------------------------- /Zhihu/Zhihu/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Zhihu project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Zhihu' 13 | 14 | SPIDER_MODULES = ['Zhihu.spiders'] 15 | NEWSPIDER_MODULE = 'Zhihu.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'Zhihu (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | DEFAULT_REQUEST_HEADERS = { 42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | 'Accept-Language': 'en', 44 | 'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20', 45 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36' 46 | } 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | # SPIDER_MIDDLEWARES = { 51 | # 'Zhihu.middlewares.ZhihuSpiderMiddleware': 543, 52 | # } 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | # DOWNLOADER_MIDDLEWARES = { 57 | # 'Zhihu.middlewares.MyCustomDownloaderMiddleware': 543, 58 | # } 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | # EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | # } 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'Zhihu.pipelines.MongoPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | # AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | # AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | # AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | # AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | # HTTPCACHE_ENABLED = True 88 | # HTTPCACHE_EXPIRATION_SECS = 0 89 | # HTTPCACHE_DIR = 'httpcache' 90 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 93 | MONGO_URI 
= 'localhost' 94 | MONGO_DATABASE = 'zhihu' -------------------------------------------------------------------------------- /Zhihu/Zhihu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Zhihu/Zhihu/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | 4 | from scrapy import Request, Spider 5 | 6 | from Zhihu.items import UserItem 7 | 8 | 9 | class ZhihuSpider(Spider): 10 | name = 'zhihu' 11 | allowed_domains = ['www.zhihu.com'] 12 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}' 13 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 14 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}' 15 | start_user = 'excited-vczh' 16 | user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics' 17 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 18 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 19 | 20 | def start_requests(self): 21 | yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user) 22 | yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0), 23 | self.parse_follows) 24 | yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0), 25 | self.parse_followers) 26 | 27 | def parse_user(self, response): 28 | result = json.loads(response.text) 29 | item = UserItem() 30 | 31 | for field in item.fields: 32 | if field in result.keys(): 33 | item[field] = result.get(field) 34 | yield item 35 | 36 | yield Request( 37 | self.follows_url.format(user=result.get( 38 | 'url_token'), include=self.follows_query, limit=20, offset=0), 39 | self.parse_follows) 40 | 41 | yield Request( 42 | self.followers_url.format(user=result.get( 43 | 'url_token'), include=self.followers_query, limit=20, offset=0), 44 | self.parse_followers) 45 | 46 | def parse_follows(self, response): 47 | results = json.loads(response.text) 48 | 49 | if 'data' in results.keys(): 50 | for result in results.get('data'): 51 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), 52 | 
self.parse_user) 53 | 54 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 55 | next_page = results.get('paging').get('next') 56 | yield Request(next_page, self.parse_follows) 57 | 58 | def parse_followers(self, response): 59 | results = json.loads(response.text) 60 | 61 | if 'data' in results.keys(): 62 | for result in results.get('data'): 63 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), 64 | self.parse_user) 65 | 66 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 67 | next_page = results.get('paging').get('next') 68 | yield Request(next_page, self.parse_followers) 69 | -------------------------------------------------------------------------------- /Zhihu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Zhihu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Zhihu 12 | --------------------------------------------------------------------------------
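
Usage note: the standalone scripts (CrawBaiduStocksA.py, CrawTaobaoPrice.py, CrawUnivRanking.py and the spider.py files under MaoYan, TaobaoProduct, TouTiao and Weixin) are run directly with the Python interpreter, while the two Scrapy projects are started from their project roots with "scrapy crawl stock" and "scrapy crawl zhihu"; several of them also expect a MongoDB server reachable at the MONGO_URI/MONGO_URL values in their config or settings files. The snippet below is a minimal, hypothetical runner sketch for the Zhihu project, not part of the repository itself; it assumes the file is saved next to Zhihu/scrapy.cfg so that get_project_settings() can locate the project settings, and that MongoDB is running locally.

# run_zhihu.py (hypothetical helper, not part of the original repository)
# Starts the zhihu spider programmatically instead of via "scrapy crawl zhihu".
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from Zhihu.spiders.zhihu import ZhihuSpider

# Reads the project settings (DEFAULT_REQUEST_HEADERS, ITEM_PIPELINES, MONGO_URI, ...)
# via the scrapy.cfg found in the current working directory.
process = CrawlerProcess(get_project_settings())
process.crawl(ZhihuSpider)  # schedule the spider defined in Zhihu/spiders/zhihu.py
process.start()             # blocking call; returns once the crawl finishes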