├── .project
├── .pydevproject
├── .settings
│   └── org.eclipse.core.resources.prefs
├── README.md
├── api
│   ├── app.py
│   ├── flask
│   │   └── app.py
│   ├── requirements.txt
│   └── templates
│       └── index.html
├── cmdline.py
├── scrapy.cfg
└── stock
    ├── __init__.py
    ├── __init__.pyc
    ├── commands
    │   └── crawlall.py
    ├── cookies.py
    ├── cookies.pyc
    ├── items.py
    ├── items.pyc
    ├── middlewares.py
    ├── middlewares.pyc
    ├── mongodb.py
    ├── mongodb.pyc
    ├── pipelines.py
    ├── pipelines.pyc
    ├── settings.py
    ├── settings.pyc
    └── spiders
        ├── __init__.py
        ├── __init__.pyc
        ├── baiduTopStockSpider.py
        ├── baiduTopStockSpider.pyc
        ├── topStockSpider.pyc
        ├── xueqiuPostSpider.py
        └── xueqiuPostSpider.pyc

--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
	<name>stock</name>
	<comment></comment>
	<projects>
	</projects>
	<buildSpec>
		<buildCommand>
			<name>org.python.pydev.PyDevBuilder</name>
			<arguments>
			</arguments>
		</buildCommand>
	</buildSpec>
	<natures>
		<nature>org.python.pydev.pythonNature</nature>
	</natures>
</projectDescription>

--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?>
<pydev_project>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
</pydev_project>

--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
encoding//stock/items.py=utf-8
encoding//stock/middlewares.py=utf-8
encoding//stock/pipelines.py=utf-8
encoding//stock/settings.py=utf-8

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# stockSpider
A Scrapy-based crawler for Xueqiu (雪球) posts, written while learning Python.

scrapy crawl baiduTopStockSpider
scrapy crawl xueqiuPostSpider

--------------------------------------------------------------------------------
/api/app.py:
--------------------------------------------------------------------------------
from flask import Flask
from flask_pymongo import PyMongo
from flask import render_template

app = Flask(__name__)

app.config['MONGO_HOST'] = 'localhost'
app.config['MONGO_PORT'] = 27017
app.config['MONGO_DBNAME'] = 'stock'
mongo = PyMongo(app, config_prefix='MONGO')


@app.route('/')
def hello():
    stocks = list(mongo.db.top.find({}))
    return render_template('index.html', stocks=stocks)
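
# --- Illustrative addition, not in the original file ---------------------------
# A minimal sketch of a JSON endpoint alongside the HTML view above, reusing the
# same `mongo` object and `top` collection that hello() reads. The /api/stocks
# route (and this whole handler) is made up for illustration only.
from flask import jsonify  # would normally sit with the imports at the top


@app.route('/api/stocks')
def stocks_json():
    # Drop Mongo's ObjectId so the documents are JSON-serialisable.
    stocks = list(mongo.db.top.find({}, {'_id': 0}))
    return jsonify(stocks=stocks)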

if __name__ == '__main__':
    app.run()

--------------------------------------------------------------------------------
/api/flask/app.py:
--------------------------------------------------------------------------------
#!flask/bin/python
from flask import Flask
import pymongo

app = Flask(__name__)

# Connection values mirror the MONGODB_* entries in stock/settings.py.
connection = pymongo.MongoClient('localhost', 27017)
db = connection['stock']
collection = db['top']


@app.route('/')
def index():
    return "Hello, World!"


if __name__ == '__main__':
    app.run(debug=True)

--------------------------------------------------------------------------------
/api/requirements.txt:
--------------------------------------------------------------------------------
click==6.7
Flask==0.12.2
itsdangerous==0.24
Jinja2==2.9.6
MarkupSafe==1.0
Werkzeug==0.12.2

--------------------------------------------------------------------------------
/api/templates/index.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>stock from baidu top</title>
</head>
<body>
    <ul>
    {% for stock in stocks %}
        <li>{{ stock.num }} - {{ stock.name }} ({{ stock.source }})</li>
    {% endfor %}
    </ul>
</body>
</html>

--------------------------------------------------------------------------------
/cmdline.py:
--------------------------------------------------------------------------------
import scrapy.cmdline

if __name__ == '__main__':
    scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'xueqiuPostSpider'])

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = stock.settings

[deploy]
#url = http://localhost:6800/
project = stock

--------------------------------------------------------------------------------
/stock/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/__init__.py

--------------------------------------------------------------------------------
/stock/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/__init__.pyc

--------------------------------------------------------------------------------
/stock/commands/crawlall.py:
--------------------------------------------------------------------------------
# from scrapy.command import ScrapyCommand
# from scrapy.utils.project import get_project_settings
# from scrapy.crawler import Crawler
#
# class Command(ScrapyCommand):
#
#     requires_project = True
#
#     def syntax(self):
#         return '[options]'
#
#     def short_desc(self):
#         return 'Runs all of the spiders'
#
#     def run(self, args, opts):
#         settings = get_project_settings()
#
#         for spider_name in self.crawler.spiders.list():
#             crawler = Crawler(settings)
#             crawler.configure()
#             spider = crawler.spiders.create(spider_name)
#             crawler.crawl(spider)
#             crawler.start()
#
#         self.crawler.start()
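
# Illustrative sketch, not part of the original file: the commented-out command
# above targets the old scrapy.command / Crawler API. Against current Scrapy the
# same "run every spider" idea could live in a small standalone script (file
# name up to you), roughly:
#
#     from scrapy.crawler import CrawlerProcess
#     from scrapy.utils.project import get_project_settings
#
#     if __name__ == '__main__':
#         process = CrawlerProcess(get_project_settings())
#         for spider_name in process.spider_loader.list():
#             process.crawl(spider_name)
#         process.start()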
"43c6fed2d6b5cc8bc38cc9694c6c1cf121d38471"}'; 15 | headers = {'User-Agent': choice(settings["USER_AGENT_CHOICES"])} 16 | r = requests.get(login_url, headers=headers) 17 | cookies = r.cookies.get_dict() 18 | 19 | return json.dumps(cookies) 20 | 21 | def init_cookie(spidername): 22 | if reds.get("%s:Cookies" % (spidername)) is None: 23 | cookie = get_cookie() 24 | reds.set("%s:Cookies" % (spidername), cookie) 25 | 26 | def update_cookie(): 27 | pass 28 | 29 | def delete_cookie(): 30 | pass -------------------------------------------------------------------------------- /stock/cookies.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/cookies.pyc -------------------------------------------------------------------------------- /stock/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TopStockItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | num = scrapy.Field() 14 | name = scrapy.Field() 15 | source = scrapy.Field() 16 | 17 | pass 18 | 19 | class PostItem(scrapy.Item): 20 | # define the fields for your item here like: 21 | authorId = scrapy.Field() 22 | viewCount = scrapy.Field() 23 | postId = scrapy.Field() 24 | postTitle = scrapy.Field() 25 | postDetail = scrapy.Field() 26 | 27 | pass 28 | -------------------------------------------------------------------------------- /stock/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/items.pyc -------------------------------------------------------------------------------- /stock/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 10 | from .cookies import init_cookie, update_cookie 11 | import redis 12 | from random import choice 13 | from scrapy.exceptions import NotConfigured 14 | import json 15 | 16 | class CookieMiddleware(RetryMiddleware): 17 | def __init__(self, settings, crawler): 18 | RetryMiddleware.__init__(self, settings) 19 | self.reds = redis.Redis.from_url(settings['REDIS_URL'], db=2, decode_responses=True) 20 | init_cookie(crawler.spider.name) 21 | 22 | @classmethod 23 | def from_crawler(cls, crawler): 24 | return cls(crawler.settings, crawler) 25 | 26 | def process_request(self, request, spider): 27 | redisKeys = self.reds.keys() 28 | while len(redisKeys) > 0: 29 | if spider.name + ':Cookies' in redisKeys: 30 | try: 31 | cookie = json.loads(self.reds.get(spider.name + ':Cookies')) 32 | request.cookies = cookie 33 | except: 34 | print "json string convert to dict failed" 35 | break 36 | else: 37 | pass 38 | #redisKeys.remove(elem) 39 | 40 | 41 | class UserAgentMiddleware(object): 42 | # Not all methods need to be defined. If a method is not defined, 43 | # scrapy acts as if the spider middleware does not modify the 44 | # passed objects. 

--------------------------------------------------------------------------------
/stock/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/items.pyc

--------------------------------------------------------------------------------
/stock/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from .cookies import init_cookie, update_cookie
import redis
from random import choice
from scrapy.exceptions import NotConfigured
import json

class CookieMiddleware(RetryMiddleware):
    def __init__(self, settings, crawler):
        RetryMiddleware.__init__(self, settings)
        self.reds = redis.Redis.from_url(settings['REDIS_URL'], db=2, decode_responses=True)
        init_cookie(crawler.spider.name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler)

    def process_request(self, request, spider):
        # Attach the cookies cached in Redis (if any) to every outgoing request.
        key = spider.name + ':Cookies'
        if key in self.reds.keys():
            try:
                cookie = json.loads(self.reds.get(key))
                request.cookies = cookie
            except ValueError:
                print "failed to convert cookie JSON string to dict"


class UserAgentMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    def __init__(self, user_agents):
        self.enabled = False
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        # s = cls()
        # crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        # return s
        user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])

        if not user_agents:
            raise NotConfigured("USER_AGENT_CHOICES not set or empty")

        o = cls(user_agents)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)

        return o

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
        self.enabled = getattr(spider, 'rotate_user_agent', self.enabled)

    def process_request(self, request, spider):
        if not self.enabled or not self.user_agents:
            return
        request.headers['user-agent'] = choice(self.user_agents)

--------------------------------------------------------------------------------
/stock/middlewares.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/middlewares.pyc

--------------------------------------------------------------------------------
/stock/mongodb.py:
--------------------------------------------------------------------------------
import pymongo
from scrapy.conf import settings

def init_mongodb():
    connection = pymongo.MongoClient(
        settings['MONGODB_SERVER'],
        settings['MONGODB_PORT']
    )

    db = connection[settings['MONGODB_DB']]

    return db
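
# --- Illustrative alternative, not in the original file ------------------------
# scrapy.conf.settings (used above) is the old global-settings import from
# Scrapy 1.x and earlier. A minimal sketch of the same helper written against
# get_project_settings(), which also works from plain scripts; the function
# name below is made up for this sketch.
from scrapy.utils.project import get_project_settings


def init_mongodb_from_project_settings():
    project_settings = get_project_settings()
    connection = pymongo.MongoClient(
        project_settings['MONGODB_SERVER'],
        project_settings['MONGODB_PORT']
    )
    return connection[project_settings['MONGODB_DB']]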

--------------------------------------------------------------------------------
/stock/mongodb.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/mongodb.pyc

--------------------------------------------------------------------------------
/stock/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from scrapy import log
from scrapy.conf import settings
from stock.mongodb import init_mongodb

class StockPipeline(object):

    def __init__(self):
        self.db = init_mongodb()

    def process_item(self, item, spider):
        if spider.name == 'baiduTopStockSpider':
            # MONGODB_COLLECTION is "top" in settings.py; it is also the
            # collection the Flask view in api/app.py reads from.
            collection = self.db[settings['MONGODB_COLLECTION']]
            d = dict(item)
            cursor = list(collection.find({'num': d["num"], 'source': d["source"]}))

            if cursor:
                collection.update({'_id': cursor[0]['_id']}, d)
            else:
                collection.insert(d)
            log.msg("stock added to MongoDB database!", level=log.DEBUG, spider=spider)
        elif spider.name == 'xueqiuPostSpider':
            collection = self.db['post']
            collection.save(dict(item))
            log.msg("post added to MongoDB database!", level=log.DEBUG, spider=spider)

        return item
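
# --- Illustrative variant, not in the original file ----------------------------
# collection.update()/insert()/save() are the legacy pymongo calls; on pymongo
# 3.x the same "update or insert" logic can be written as a single
# update_one(..., upsert=True). The class name below is made up for this sketch
# and is not wired into ITEM_PIPELINES.
class UpsertStockPipeline(object):

    def __init__(self):
        self.db = init_mongodb()

    def process_item(self, item, spider):
        if spider.name == 'baiduTopStockSpider':
            d = dict(item)
            self.db[settings['MONGODB_COLLECTION']].update_one(
                {'num': d['num'], 'source': d['source']},  # match key
                {'$set': d},                               # fields to write
                upsert=True)                               # insert if missing
        elif spider.name == 'xueqiuPostSpider':
            self.db['post'].insert_one(dict(item))
        return item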

--------------------------------------------------------------------------------
/stock/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/pipelines.pyc

--------------------------------------------------------------------------------
/stock/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for stock project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'stock'

SPIDER_MODULES = ['stock.spiders']
NEWSPIDER_MODULE = 'stock.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stock (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'stock.middlewares.StockSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'stock.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'scrapy.pipeline.images.ImagesPipeline': 1,
    'stock.pipelines.StockPipeline': 10,
    'scrapy_redis.pipelines.RedisPipeline': 300,
}

IMAGES_STORE = '/path/to/valid/dir'
IMAGES_EXPIRES = 30
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (270, 270),
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "stock"
MONGODB_COLLECTION = "top"

FEED_URI = '/Users/sean/Documents/topStock.csv'
FEED_FORMAT = 'csv'

# COMMANDS_MODULE = 'stock.commands'


# Use the Redis-backed scheduler to store the request queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Make sure every spider deduplicates requests through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Requests are serialized with pickle by default, but this can be swapped for
# something similar. Note: this works on Python 2.x but not on 3.x.
#SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"

# Don't clear the Redis queues, so crawls can be paused and resumed
#SCHEDULER_PERSIST = True

# Schedule requests using a priority queue (the default)
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Other queues that can be used instead
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

# Maximum idle time, to keep distributed spiders from closing while waiting
#SCHEDULER_IDLE_BEFORE_CLOSE = 10

# Scraped items are also pushed to Redis for post-processing

DOWNLOADER_MIDDLEWARES = {
    'stock.middlewares.UserAgentMiddleware': 100,
    'stock.middlewares.CookieMiddleware': 200
}

# The item pipeline serializes scraped items and stores them under this Redis key
REDIS_ITEMS_KEY = '%(spider)s:items'

# Items are serialized with ScrapyJSONEncoder by default.
# You can use any importable path to a callable object.
#REDIS_ITEMS_SERIALIZER = 'json.dumps'
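
# Illustrative note, not part of the original settings: with RedisPipeline
# enabled above, scraped items are RPUSH-ed to the '<spider>:items' list, so a
# quick way to spot-check them from a Python 2 shell is (sketch):
#
#     import redis, json
#     r = redis.Redis.from_url('redis://127.0.0.1:6379')
#     print json.loads(r.lrange('xueqiuPostSpider:items', 0, 0)[0])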

# Host and port to use when connecting to Redis (optional)
#REDIS_HOST = 'localhost'
#REDIS_PORT = 6379

# URL used to connect to Redis (optional).
# If set, it takes precedence over REDIS_HOST and REDIS_PORT.
REDIS_URL = 'redis://127.0.0.1:6379'

# Custom Redis client parameters (connection timeouts and the like)
#REDIS_PARAMS = {}

# Custom Redis client class
#REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'

# If True, use Redis' 'spop' operation. Useful if you need to avoid duplicates
# in the start URL list; with this enabled, URLs must be added via 'sadd' or
# you will get a type error.
#REDIS_START_URLS_AS_SET = False

# Default start_urls key for RedisSpider and RedisCrawlSpider
#REDIS_START_URLS_KEY = '%(name)s:start_urls'

# Use an encoding other than utf-8 for Redis
#REDIS_ENCODING = 'latin1'

USER_AGENT_CHOICES = [
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
    # 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    # 'Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)',
    # 'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
    # 'DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)',
    # 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
    # 'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)',
    # 'ia_archiver (+http://www.alexa.com/site/help/webmasters; crawler@alexa.com)',
]

--------------------------------------------------------------------------------
/stock/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/settings.pyc

--------------------------------------------------------------------------------
/stock/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/stock/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/spiders/__init__.pyc

--------------------------------------------------------------------------------
/stock/spiders/baiduTopStockSpider.py:
--------------------------------------------------------------------------------
import scrapy
from scrapy.selector import Selector
from stock.items import TopStockItem

class baiduTopStockSpider(scrapy.spiders.Spider):
    name = "baiduTopStockSpider"
    allowed_domains = ["baidu.com"]
    start_urls = [
        "http://top.baidu.com/buzz?b=276&c=17&fr=topbuzz_b277_c17",
    ]
    url = 'http://top.baidu.com'

    def parse(self, response):
        # response is a scrapy.http.response.html.HtmlResponse
        selector = Selector(response)
        stocks = selector.xpath('//td[@class="keyword"]/a[@class="list-title"]')

        for index, stock in enumerate(stocks):
            # Build a fresh item per row so each yielded item keeps its own values.
            item = TopStockItem()
            item['name'] = stock.xpath('text()').extract()[0]
            item['num'] = index + 1
            item['source'] = "baidu"

            yield item
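
# Illustrative variant, not part of the original file: extract()[0] raises
# IndexError when a keyword cell has no text, while extract_first() returns
# None instead. The parse() loop above could be written more defensively as:
#
#     for index, stock in enumerate(stocks, start=1):
#         name = stock.xpath('text()').extract_first()
#         if name:
#             yield TopStockItem(name=name, num=index, source='baidu')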
"https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=" + str(json.loads(response.text)['next_max_id']) + "&count=10&category=-1" 39 | 40 | yield Request(url, callback=self.parse, headers=self.headers, cookies=self.cookies) 41 | 42 | 43 | def parse_detail(self, response): 44 | url = urlparse.urlparse(response.url) 45 | path = url.path.split("/") 46 | 47 | item = PostItem() 48 | selector = Selector(response) 49 | 50 | item['postId'] = path[2] 51 | item['authorId'] = path[1] 52 | item['postDetail'] = selector.xpath('//div[@class="detail"]').extract()[0] 53 | 54 | yield item 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /stock/spiders/xueqiuPostSpider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/spiders/xueqiuPostSpider.pyc --------------------------------------------------------------------------------