├── .project
├── .pydevproject
├── .settings
│   └── org.eclipse.core.resources.prefs
├── README.md
├── api
│   ├── app.py
│   ├── flask
│   │   └── app.py
│   ├── requirements.txt
│   └── templates
│       └── index.html
├── cmdline.py
├── scrapy.cfg
└── stock
    ├── __init__.py
    ├── __init__.pyc
    ├── commands
    │   └── crawlall.py
    ├── cookies.py
    ├── cookies.pyc
    ├── items.py
    ├── items.pyc
    ├── middlewares.py
    ├── middlewares.pyc
    ├── mongodb.py
    ├── mongodb.pyc
    ├── pipelines.py
    ├── pipelines.pyc
    ├── settings.py
    ├── settings.pyc
    └── spiders
        ├── __init__.py
        ├── __init__.pyc
        ├── baiduTopStockSpider.py
        ├── baiduTopStockSpider.pyc
        ├── topStockSpider.pyc
        ├── xueqiuPostSpider.py
        └── xueqiuPostSpider.pyc
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 |     <name>stock</name>
4 |     <comment></comment>
5 |     <projects>
6 |     </projects>
7 |     <buildSpec>
8 |         <buildCommand>
9 |             <name>org.python.pydev.PyDevBuilder</name>
10 |             <arguments>
11 |             </arguments>
12 |         </buildCommand>
13 |     </buildSpec>
14 |     <natures>
15 |         <nature>org.python.pydev.pythonNature</nature>
16 |     </natures>
17 | </projectDescription>
18 | 
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?><pydev_project>
3 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
4 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
5 | </pydev_project>
6 | 
--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//stock/items.py=utf-8
3 | encoding//stock/middlewares.py=utf-8
4 | encoding//stock/pipelines.py=utf-8
5 | encoding//stock/settings.py=utf-8
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # stockSpider
2 | A Scrapy-based spider for crawling Xueqiu (雪球) posts, written while learning Python.
3 | 
4 | scrapy crawl baiduTopStockSpider
5 | scrapy crawl xueqiuPostSpider
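6 | 
7 | Both spiders assume the services configured in stock/settings.py are available
8 | locally: MongoDB on localhost:27017 and Redis at redis://127.0.0.1:6379.
9 | The Baidu results written to the `top` collection can then be browsed with the
10 | small Flask app under api/ (install api/requirements.txt, then run `python api/app.py`).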
--------------------------------------------------------------------------------
/api/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 | from flask_pymongo import PyMongo
3 | from flask import render_template
4 |
5 | app = Flask(__name__)
6 |
7 | app.config['MONGO_HOST'] = 'localhost'
8 | app.config['MONGO_PORT'] = 27017
9 | app.config['MONGO_DBNAME'] = 'stock'
10 | mongo = PyMongo(app, config_prefix='MONGO')
11 |
12 |
13 | @app.route('/')
14 | def hello():
15 | stocks = list(mongo.db.top.find({}))
16 | return render_template('index.html', stocks=stocks)
17 |
18 | if __name__ == '__main__':
19 | app.run()
20 |
--------------------------------------------------------------------------------
/api/flask/app.py:
--------------------------------------------------------------------------------
1 | #!flask/bin/python
2 | import pymongo
3 | from flask import Flask
4 | 
5 | app = Flask(__name__)
6 | 
7 | # MongoDB connection details, mirroring stock/settings.py
8 | MONGODB_SERVER = 'localhost'
9 | MONGODB_PORT = 27017
10 | MONGODB_DB = 'stock'
11 | MONGODB_COLLECTION = 'top'
12 | 
13 | connection = pymongo.MongoClient(MONGODB_SERVER, MONGODB_PORT)
14 | db = connection[MONGODB_DB]
15 | collection = db[MONGODB_COLLECTION]
16 | 
17 | @app.route('/')
18 | def index():
19 |     return "Hello, World!"
20 | 
21 | if __name__ == '__main__':
22 |     app.run(debug=True)
23 | 
--------------------------------------------------------------------------------
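The stub route above only returns a greeting. A minimal sketch of a JSON endpoint over the same data, assuming the crawled documents live in MongoDB under `stock.top` as configured in stock/settings.py (the `/stocks` route and the field projection are illustrative, not part of the project):

```python
import pymongo
from flask import Flask, jsonify

app = Flask(__name__)
# same database/collection the Scrapy pipeline writes to (stock/settings.py)
collection = pymongo.MongoClient('localhost', 27017)['stock']['top']

@app.route('/stocks')
def stocks():
    # drop Mongo's _id so the documents serialize to JSON directly
    return jsonify(list(collection.find({}, {'_id': 0})))

if __name__ == '__main__':
    app.run(debug=True)
```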
/api/requirements.txt:
--------------------------------------------------------------------------------
1 | click==6.7
2 | Flask==0.12.2
3 | itsdangerous==0.24
4 | Jinja2==2.9.6
5 | MarkupSafe==1.0
6 | Werkzeug==0.12.2
7 | Flask-PyMongo<2.0
8 | 
--------------------------------------------------------------------------------
/api/templates/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 |     <meta charset="utf-8">
5 |     <title>stock from baidu top</title>
6 | </head>
7 | <body>
8 |     <ul>
9 |     {% for stock in stocks %}
10 |         <li>{{ stock.name }}</li>
11 |     {% endfor %}
12 |     </ul>
13 | </body>
14 | </html>
15 | 
--------------------------------------------------------------------------------
/cmdline.py:
--------------------------------------------------------------------------------
1 | import scrapy.cmdline
2 |
3 | if __name__ == '__main__':
4 | scrapy.cmdline.execute(argv=['scrapy','crawl','xueqiuPostSpider'])
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = stock.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = stock
12 |
--------------------------------------------------------------------------------
/stock/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/__init__.py
--------------------------------------------------------------------------------
/stock/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/__init__.pyc
--------------------------------------------------------------------------------
/stock/commands/crawlall.py:
--------------------------------------------------------------------------------
1 | # from scrapy.command import ScrapyCommand
2 | # from scrapy.utils.project import get_project_settings
3 | # from scrapy.crawler import Crawler
4 | #
5 | # class Command(ScrapyCommand):
6 | #
7 | # requires_project = True
8 | #
9 | # def syntax(self):
10 | # return '[options]'
11 | #
12 | # def short_desc(self):
13 | # return 'Runs all of the spiders'
14 | #
15 | # def run(self, args, opts):
16 | # settings = get_project_settings()
17 | #
18 | # for spider_name in self.crawler.spiders.list():
19 | # crawler = Crawler(settings)
20 | # crawler.configure()
21 | # spider = crawler.spiders.create(spider_name)
22 | # crawler.crawl(spider)
23 | # crawler.start()
24 | #
25 | # self.crawler.start()
--------------------------------------------------------------------------------
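The commented-out command above targets pre-1.0 Scrapy APIs (scrapy.command, crawler.spiders). A rough sketch of the same "run every spider" idea against the current CrawlerProcess API, using the spider names defined in stock/spiders/, might look like:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    # load stock/settings.py so SPIDER_MODULES, pipelines and middlewares apply
    process = CrawlerProcess(get_project_settings())
    process.crawl('baiduTopStockSpider')
    process.crawl('xueqiuPostSpider')
    process.start()  # blocks until both spiders have finished
```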
/stock/cookies.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import redis
4 | import logging
5 | from .settings import REDIS_URL
6 | from random import choice
7 | from scrapy.conf import settings
8 |
9 | reds = redis.Redis.from_url(REDIS_URL, db=2, decode_responses=True)
10 | login_url = 'https://xueqiu.com'
11 |
12 | def get_cookie():
13 | # request and get cookie
14 | # return '{"xq_a_token": "0a52c567442f1fdd8b09c27e0abb26438e274a7e","xq_r_token": "43c6fed2d6b5cc8bc38cc9694c6c1cf121d38471"}';
15 | headers = {'User-Agent': choice(settings["USER_AGENT_CHOICES"])}
16 | r = requests.get(login_url, headers=headers)
17 | cookies = r.cookies.get_dict()
18 |
19 | return json.dumps(cookies)
20 |
21 | def init_cookie(spidername):
22 | if reds.get("%s:Cookies" % (spidername)) is None:
23 | cookie = get_cookie()
24 | reds.set("%s:Cookies" % (spidername), cookie)
25 |
26 | def update_cookie():
27 | pass
28 |
29 | def delete_cookie():
30 | pass
--------------------------------------------------------------------------------
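update_cookie and delete_cookie above are left as empty stubs. One possible shape for them, continuing in stock/cookies.py and reusing the module's reds client and get_cookie helper (the spidername parameter is an assumption; the original stubs take no arguments):

```python
def update_cookie(spidername):
    # re-fetch a cookie from xueqiu.com and overwrite the cached one
    reds.set("%s:Cookies" % spidername, get_cookie())

def delete_cookie(spidername):
    # drop the cached cookie so the next init_cookie() call fetches a fresh one
    reds.delete("%s:Cookies" % spidername)
```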
/stock/cookies.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/cookies.pyc
--------------------------------------------------------------------------------
/stock/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class TopStockItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | num = scrapy.Field()
14 | name = scrapy.Field()
15 | source = scrapy.Field()
16 |
17 | pass
18 |
19 | class PostItem(scrapy.Item):
20 | # define the fields for your item here like:
21 | authorId = scrapy.Field()
22 | viewCount = scrapy.Field()
23 | postId = scrapy.Field()
24 | postTitle = scrapy.Field()
25 | postDetail = scrapy.Field()
26 |
27 | pass
28 |
--------------------------------------------------------------------------------
/stock/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/items.pyc
--------------------------------------------------------------------------------
/stock/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | from scrapy.downloadermiddlewares.retry import RetryMiddleware
10 | from .cookies import init_cookie, update_cookie
11 | import redis
12 | from random import choice
13 | from scrapy.exceptions import NotConfigured
14 | import json
15 |
16 | class CookieMiddleware(RetryMiddleware):
17 | def __init__(self, settings, crawler):
18 | RetryMiddleware.__init__(self, settings)
19 | self.reds = redis.Redis.from_url(settings['REDIS_URL'], db=2, decode_responses=True)
20 | init_cookie(crawler.spider.name)
21 |
22 | @classmethod
23 | def from_crawler(cls, crawler):
24 | return cls(crawler.settings, crawler)
25 |
26 |     def process_request(self, request, spider):
27 |         # attach the cookie cached for this spider in redis, if one exists
28 |         redisKeys = self.reds.keys()
29 |         if spider.name + ':Cookies' in redisKeys:
30 |             try:
31 |                 cookie = json.loads(self.reds.get(spider.name + ':Cookies'))
32 |                 request.cookies = cookie
33 |             except (TypeError, ValueError):
34 |                 # a malformed cached value should not break the request
35 |                 print "failed to convert the cookie JSON string to a dict"
36 | 
37 | 
38 | 
39 | 
40 |
41 | class UserAgentMiddleware(object):
42 | # Not all methods need to be defined. If a method is not defined,
43 | # scrapy acts as if the spider middleware does not modify the
44 | # passed objects.
45 |
46 | def __init__(self, user_agents):
47 | self.enabled = False
48 | self.user_agents = user_agents
49 |
50 | @classmethod
51 | def from_crawler(cls, crawler):
52 | # This method is used by Scrapy to create your spiders.
53 | # s = cls()
54 | # crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
55 | # return s
56 | user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])
57 |
58 | if not user_agents:
59 | raise NotConfigured("USER_AGENT_CHOICES not set or empty")
60 |
61 | o = cls(user_agents)
62 | crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
63 |
64 | return o
65 |
66 | def process_spider_input(self, response, spider):
67 | # Called for each response that goes through the spider
68 | # middleware and into the spider.
69 |
70 | # Should return None or raise an exception.
71 | return None
72 |
73 | def process_spider_output(self, response, result, spider):
74 | # Called with the results returned from the Spider, after
75 | # it has processed the response.
76 |
77 | # Must return an iterable of Request, dict or Item objects.
78 | for i in result:
79 | yield i
80 |
81 | def process_spider_exception(self, response, exception, spider):
82 | # Called when a spider or process_spider_input() method
83 | # (from other spider middleware) raises an exception.
84 |
85 | # Should return either None or an iterable of Response, dict
86 | # or Item objects.
87 | pass
88 |
89 | def process_start_requests(self, start_requests, spider):
90 | # Called with the start requests of the spider, and works
91 | # similarly to the process_spider_output() method, except
92 | # that it doesn’t have a response associated.
93 |
94 | # Must return only requests (not items).
95 | for r in start_requests:
96 | yield r
97 |
98 | def spider_opened(self, spider):
99 | spider.logger.info('Spider opened: %s' % spider.name)
100 | self.enabled = getattr(spider, 'rotate_user_agent', self.enabled)
101 |
102 | def process_request(self, request, spider):
103 | if not self.enabled or not self.user_agents:
104 | return
105 | request.headers['user-agent'] = choice(self.user_agents)
106 |
--------------------------------------------------------------------------------
/stock/middlewares.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/middlewares.pyc
--------------------------------------------------------------------------------
/stock/mongodb.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 | from scrapy.conf import settings
3 |
4 | def init_mongodb():
5 | connection = pymongo.MongoClient(
6 | settings['MONGODB_SERVER'],
7 | settings['MONGODB_PORT']
8 | )
9 |
10 | db = connection[settings['MONGODB_DB']]
11 |
12 | return db
13 |
--------------------------------------------------------------------------------
/stock/mongodb.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/mongodb.pyc
--------------------------------------------------------------------------------
/stock/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import pymongo
9 | from scrapy import log
10 | from scrapy.conf import settings
11 | from stock.mongodb import init_mongodb
12 |
13 | class StockPipeline(object):
14 |
15 | def __init__(self):
16 | self.db = init_mongodb()
17 |
18 | def process_item(self, item, spider):
19 | if spider.name == 'baiduTopStockSpider':
20 |             collection = self.db[settings['MONGODB_COLLECTION']]
21 | d = dict(item)
22 | cursor = list(collection.find({'num': d["num"], 'source': d["source"]}))
23 |
24 | if cursor:
25 | collection.update({'_id': cursor[0]['_id']}, d)
26 | else:
27 | collection.insert(d)
28 | log.msg("stock added to MongoDB database!", level=log.DEBUG, spider=spider)
29 | elif spider.name == 'xueqiuPostSpider':
30 | collection = self.db['post']
31 | collection.save(dict(item))
32 | log.msg("post added to MongoDB database!", level=log.DEBUG, spider=spider)
33 |
34 | return item
35 |
--------------------------------------------------------------------------------
/stock/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/pipelines.pyc
--------------------------------------------------------------------------------
/stock/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for stock project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'stock'
13 |
14 | SPIDER_MODULES = ['stock.spiders']
15 | NEWSPIDER_MODULE = 'stock.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'stock (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'stock.middlewares.StockSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'stock.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | # 'scrapy.pipeline.images.ImagesPipeline': 1,
69 | 'stock.pipelines.StockPipeline': 10,
70 | 'scrapy_redis.pipelines.RedisPipeline': 300,
71 | }
72 |
73 | IMAGES_STORE = '/path/to/valid/dir'
74 | IMAGES_EXPIRES = 30
75 | IMAGES_THUMBS = {
76 | 'small': (50, 50),
77 | 'big': (270, 270),
78 | }
79 | # Enable and configure the AutoThrottle extension (disabled by default)
80 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
81 | #AUTOTHROTTLE_ENABLED = True
82 | # The initial download delay
83 | #AUTOTHROTTLE_START_DELAY = 5
84 | # The maximum download delay to be set in case of high latencies
85 | #AUTOTHROTTLE_MAX_DELAY = 60
86 | # The average number of requests Scrapy should be sending in parallel to
87 | # each remote server
88 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
89 | # Enable showing throttling stats for every response received:
90 | #AUTOTHROTTLE_DEBUG = False
91 |
92 | # Enable and configure HTTP caching (disabled by default)
93 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
94 | #HTTPCACHE_ENABLED = True
95 | #HTTPCACHE_EXPIRATION_SECS = 0
96 | #HTTPCACHE_DIR = 'httpcache'
97 | #HTTPCACHE_IGNORE_HTTP_CODES = []
98 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
99 |
100 |
101 | MONGODB_SERVER = "localhost"
102 | MONGODB_PORT = 27017
103 | MONGODB_DB = "stock"
104 | MONGODB_COLLECTION = "top"
105 |
106 | FEED_URI = '/Users/sean/Documents/topStock.csv'
107 | FEED_FORMAT = 'csv'
108 |
109 | # COMMANDS_MODULE = 'stock.commands'
110 |
111 |
112 | # Enable the Redis scheduler so the request queue is stored in Redis
113 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
114 | 
115 | # Make sure all spiders deduplicate requests through Redis
116 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
117 | 
118 | # Requests are serialized with pickle by default, but a similar serializer can be used instead. Note: this works on Python 2.x but not on 3.x.
119 | #SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"
120 | 
121 | # Do not clear the Redis queue, so crawls can be paused and resumed
122 | #SCHEDULER_PERSIST = True
123 | 
124 | # Schedule requests with a priority queue (the default)
125 | #SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
126 | # Other queue classes that can be used instead
127 | #SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
128 | #SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
129 | 
130 | # Maximum idle time, so distributed spiders are not closed while waiting for work
131 | #SCHEDULER_IDLE_BEFORE_CLOSE = 10
132 | 
133 | # Store scraped items in Redis for post-processing (see RedisPipeline in ITEM_PIPELINES)
134 | 
135 | DOWNLOADER_MIDDLEWARES = {
136 |     'stock.middlewares.UserAgentMiddleware': 100,
137 |     'stock.middlewares.CookieMiddleware': 200
138 | }
139 | 
140 | # Redis key used to store serialized items from the item pipeline
141 | REDIS_ITEMS_KEY = '%(spider)s:items'
142 | 
143 | # Items are serialized with ScrapyJSONEncoder by default.
144 | # You can use any importable path to a callable object.
145 | #REDIS_ITEMS_SERIALIZER = 'json.dumps'
146 | 
147 | # Host and port used to connect to Redis (optional)
148 | #REDIS_HOST = 'localhost'
149 | #REDIS_PORT = 6379
150 | 
151 | # Full URL used to connect to Redis (optional).
152 | # If set, it takes precedence over REDIS_HOST and REDIS_PORT.
153 | REDIS_URL = 'redis://127.0.0.1:6379'
154 | 
155 | # Custom Redis client parameters (connection timeout and the like)
156 | #REDIS_PARAMS = {}
157 | 
158 | # Custom Redis client class
159 | #REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'
160 | 
161 | # If True, use Redis' 'spop' operation when fetching start URLs.
162 | # Useful to avoid duplicates in the start URL list; with this enabled, URLs must be added with 'sadd', otherwise a type error is raised.
163 | #REDIS_START_URLS_AS_SET = False
164 | 
165 | # Default start_urls key for RedisSpider and RedisCrawlSpider
166 | #REDIS_START_URLS_KEY = '%(name)s:start_urls'
167 | 
168 | # Use an encoding other than utf-8 for Redis
169 | #REDIS_ENCODING = 'latin1'
170 |
171 | USER_AGENT_CHOICES = [
172 | 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
173 | # 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
174 | # 'Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)',
175 | # 'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
176 | # 'DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)',
177 | # 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
178 | # 'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)',
179 | # 'ia_archiver (+http://www.alexa.com/site/help/webmasters; crawler@alexa.com)',
180 | ]
181 |
--------------------------------------------------------------------------------
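Since ITEM_PIPELINES above also routes items through scrapy_redis.pipelines.RedisPipeline, scraped items end up in Redis lists named after REDIS_ITEMS_KEY. A small sketch for inspecting them, assuming the REDIS_URL above and the default ScrapyJSONEncoder serialization:

```python
import json
import redis

# same connection URL as REDIS_URL in stock/settings.py
reds = redis.Redis.from_url('redis://127.0.0.1:6379', decode_responses=True)

# RedisPipeline pushes items onto '<spider name>:items' (REDIS_ITEMS_KEY)
for raw in reds.lrange('xueqiuPostSpider:items', 0, 9):
    print(json.loads(raw))
```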
/stock/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/settings.pyc
--------------------------------------------------------------------------------
/stock/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/stock/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/spiders/__init__.pyc
--------------------------------------------------------------------------------
/stock/spiders/baiduTopStockSpider.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy.selector import Selector
3 | from stock.items import TopStockItem
4 |
5 | class baiduTopStockSpider(scrapy.spiders.Spider):
6 | name = "baiduTopStockSpider"
7 | allowed_domains = ["baidu.com"]
8 | start_urls = [
9 | "http://top.baidu.com/buzz?b=276&c=17&fr=topbuzz_b277_c17",
10 | ]
11 | url = 'http://top.baidu.com'
12 |
13 | def parse(self, response):
14 | # print(response, type(response))
15 | # from scrapy.http.response.html import HtmlResponse
16 |         selector = Selector(response)
17 |         stocks = selector.xpath('//td[@class="keyword"]/a[@class="list-title"]')
18 | 
19 |         for index, stock in enumerate(stocks):
20 |             # create a fresh item per result so earlier ones are not overwritten
21 |             item = TopStockItem()
22 |             item['name'] = stock.xpath('text()').extract()[0]
23 |             item['num'] = index + 1
24 |             item['source'] = "baidu"
25 |             yield item
26 |
--------------------------------------------------------------------------------
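A standalone way to sanity-check the spider's XPath against the live page (if it is still reachable), using requests and Scrapy's Selector outside a crawl; the URL and selector are copied from baiduTopStockSpider above:

```python
import requests
from scrapy.selector import Selector

html = requests.get('http://top.baidu.com/buzz?b=276&c=17&fr=topbuzz_b277_c17').text
# same XPath the spider uses, narrowed to the link text
names = Selector(text=html).xpath('//td[@class="keyword"]/a[@class="list-title"]/text()').extract()
print(names[:5])
```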
/stock/spiders/baiduTopStockSpider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/spiders/baiduTopStockSpider.pyc
--------------------------------------------------------------------------------
/stock/spiders/topStockSpider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/spiders/topStockSpider.pyc
--------------------------------------------------------------------------------
/stock/spiders/xueqiuPostSpider.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | import urlparse
3 | from scrapy.selector import Selector
4 | import json
5 | from scrapy import log
6 | from scrapy.http import Request,FormRequest
7 | from stock.items import PostItem
8 |
9 | class xueqiuPostSpider(scrapy.spiders.Spider):
10 | name = "xueqiuPostSpider"
11 | rotate_user_agent = True
12 | allowed_domains = ["xueqiu.com"]
13 | start_urls = ["https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1"]
14 |
15 | def __init__(self):
16 | self.headers = {}
17 | self.cookies = {}
18 |
19 | def start_requests(self):
20 | for i, url in enumerate(self.start_urls):
21 | yield FormRequest(url,
22 | meta = {'cookiejar': i},
23 | headers = self.headers,
24 | cookies = self.cookies,
25 | callback = self.parse,
26 | dont_filter = True)#jump to login page
27 |
28 | def parse(self, response):
29 | # print(response.text, type(response))
30 | # from scrapy.http.response.html import HtmlResponse
31 |
32 |         for entry in json.loads(response.text)['list']:
33 |             data = json.loads(entry['data'])
34 |             detail_url = "https://xueqiu.com/" + str(data['user']['id']) + '/' + str(data['id'])
35 |
36 | yield Request(detail_url, callback=self.parse_detail, headers=self.headers, cookies=self.cookies)
37 |
38 | url = "https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=" + str(json.loads(response.text)['next_max_id']) + "&count=10&category=-1"
39 |
40 | yield Request(url, callback=self.parse, headers=self.headers, cookies=self.cookies)
41 |
42 |
43 | def parse_detail(self, response):
44 | url = urlparse.urlparse(response.url)
45 | path = url.path.split("/")
46 |
47 | item = PostItem()
48 | selector = Selector(response)
49 |
50 | item['postId'] = path[2]
51 | item['authorId'] = path[1]
52 | item['postDetail'] = selector.xpath('//div[@class="detail"]').extract()[0]
53 |
54 | yield item
55 |
56 |
57 |
58 |
59 |
--------------------------------------------------------------------------------
/stock/spiders/xueqiuPostSpider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mizhdi/stockSpider/a4c0092a312b7fc1be1b4468779febcea6bda625/stock/spiders/xueqiuPostSpider.pyc
--------------------------------------------------------------------------------