├── README.md
├── get_content
│   ├── __init__.py
│   ├── items.py
│   ├── model.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── get_content_spider.py
└── scrapy.cfg

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
#### Scraping the longhubang (龙虎榜) list from 东方财富网 (Eastmoney)

* Uses Scrapy to crawl longhubang data from 东方财富网.
* Implements incremental updates: only dates after the latest date already stored in the database are crawled.
* Uses an item pipeline to clean the intermediate data.
* Uses peewee to simplify the database writes.
* To run it, change into the project directory and execute: scrapy crawl get_content
* The trading-day dates are read from a database table; wire in your own date source if you want to use the project (a stand-in sketch follows right after this README).
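
The last point refers to `build_url_by_loss_date()` in `spiders/get_content_spider.py`, which reads Shanghai trading days from a separate MySQL database (`update`.`tradeday`, column `SH_tradeday`). If you do not have that table, one minimal stand-in is to approximate trading days with pandas business days. This is only a sketch: exchange holidays are not excluded, and `get_trade_days` is a hypothetical helper, not part of the project.

```python
import pandas as pd

def get_trade_days(start, end):
    # Hypothetical replacement for the tradeday query: business days between
    # two 'YYYY-MM-DD' strings, returned as datetime.date objects.
    # Exchange holidays are NOT excluded, so this is only an approximation.
    return [ts.date() for ts in pd.bdate_range(start, end)]
```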
--------------------------------------------------------------------------------
/get_content/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donadonny/get_longhubang/6ce325a38ec1f084181cd3fc7e83762c87b536e9/get_content/__init__.py

--------------------------------------------------------------------------------
/get_content/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class GetContentItem(scrapy.Item):
    # One longhubang row: a broker seat's buy/sell figures for a stock on a date.
    date = scrapy.Field()
    stock_code = scrapy.Field()
    department = scrapy.Field()      # broker seat (营业部) name
    buy = scrapy.Field()
    buy_percent = scrapy.Field()
    sell = scrapy.Field()
    sell_percent = scrapy.Field()
    net = scrapy.Field()
    tag = scrapy.Field()             # 1 = buy-side table, 2 = sell-side table
    serial_number = scrapy.Field()
    reason = scrapy.Field()          # reason the stock made the list

--------------------------------------------------------------------------------
/get_content/model.py:
--------------------------------------------------------------------------------
# __author__ = 'fit'
# -*- coding: utf-8 -*-
from peewee import *

db = MySQLDatabase(database="pachong", host="192.168.0.114", port=3306, user="root", passwd="fit123456",
                   charset="utf8")
db.connect()


# ORM model for the longhubang (trading leaderboard) table.
class longhubang(Model):
    stock_code = CharField()
    date = DateField()
    department = CharField()
    buy = DoubleField(null=True)
    buy_percent = DoubleField(null=True)
    sell = DoubleField(null=True)
    sell_percent = DoubleField(null=True)
    net = DoubleField(null=True)
    reason = CharField(null=True)
    tag = IntegerField()
    serial_number = IntegerField(null=True)

    class Meta:
        database = db

--------------------------------------------------------------------------------
/get_content/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import datetime
import re
from model import *


class transformPipeline(object):
    # Convert the raw strings scraped by the spider into typed values.
    def __init__(self):
        # Matches an optionally signed integer or decimal, e.g. "-12.3".
        self.pattern = re.compile(r'^(-?\d+)(\.\d+)?$')

    def process_item(self, item, spider):
        item['date'] = datetime.datetime.strptime(item['date'], "%Y-%m-%d").date()
        if self.pattern.match(item['buy']):
            item['buy'] = float(item['buy'])
        else:
            item['buy'] = None
        if self.pattern.match(item['sell']):
            item['sell'] = float(item['sell'])
        else:
            item['sell'] = None
        if self.pattern.match(item['net']):
            item['net'] = float(item['net'])
        else:
            item['net'] = None
        # Percentages arrive as strings like "3.21%"; strip the trailing "%".
        if len(item['buy_percent']) <= 1:
            item['buy_percent'] = None
        else:
            item['buy_percent'] = float(item['buy_percent'][:-1])

        if len(item['sell_percent']) <= 1:
            item['sell_percent'] = None
        else:
            item['sell_percent'] = float(item['sell_percent'][:-1])
        if self.pattern.match(item['serial_number']):
            item['serial_number'] = int(item['serial_number'])
        else:
            item['serial_number'] = None
        return item


class GetContentPipeline(object):
    # Persist each cleaned item as one row of the longhubang table.
    def process_item(self, item, spider):
        # Plain insert; an upsert here turned out to be too slow.
        one = longhubang(
            stock_code=item['stock_code'],
            date=item['date'],
            reason=item['reason'],
            department=item['department'],
            sell=item['sell'],
            sell_percent=item['sell_percent'],
            buy=item['buy'],
            buy_percent=item['buy_percent'],
            tag=item['tag'],
            net=item['net'],
            serial_number=item['serial_number'],
        )
        one.save()
        return item
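
GetContentPipeline assumes the longhubang table already exists; peewee can create it once from the model definition above. A one-off setup sketch, assuming the connection settings in model.py point at a reachable MySQL server (`safe=True` makes the call a no-op if the table is already there):

```python
# One-off setup sketch: create the longhubang table from the peewee model.
from get_content.model import db, longhubang

db.create_tables([longhubang], safe=True)
```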
--------------------------------------------------------------------------------
/get_content/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for get_content project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'get_content'

SPIDER_MODULES = ['get_content.spiders']
NEWSPIDER_MODULE = 'get_content.spiders'
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Referer': 'http://data.eastmoney.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
}
DOWNLOAD_DELAY = 1
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'get_content (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN=16
# CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
# COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED=False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'get_content.middlewares.MyCustomSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'get_content.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'get_content.pipelines.transformPipeline': 300,
    'get_content.pipelines.GetContentPipeline': 800,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
# AUTOTHROTTLE_ENABLED=True
# The initial download delay
# AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED=True
# HTTPCACHE_EXPIRATION_SECS=0
# HTTPCACHE_DIR='httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES=[]
# HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
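
ITEM_PIPELINES above runs the two pipelines in ascending priority order: transformPipeline (300) turns the raw strings scraped from the page into typed values, and GetContentPipeline (800) then writes the row to MySQL. A rough sketch of that flow for a single hand-made item; the field values are invented, and because pipelines.py imports model.py, the MySQL server configured there has to be reachable even for the first step:

```python
from get_content.items import GetContentItem
from get_content.pipelines import transformPipeline, GetContentPipeline

item = GetContentItem(date='2016-03-01', stock_code='600000', department='example seat',
                      reason='example reason', tag=1, serial_number='1',
                      buy='1234.56', buy_percent='3.21%', sell='-', sell_percent='%',
                      net='1234.56')
item = transformPipeline().process_item(item, spider=None)   # strings -> date/float/int/None
GetContentPipeline().process_item(item, spider=None)          # one row inserted via peewee
```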
--------------------------------------------------------------------------------
/get_content/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/get_content/spiders/get_content_spider.py:
--------------------------------------------------------------------------------
# __author__ = 'fit'
# -*- coding: utf-8 -*-
from get_content.model import *
from get_content.items import *
import urllib
import re
import MySQLdb
import pandas as pd
import scrapy


class get_content_spider(scrapy.Spider):
    name = "get_content"
    allowed_domains = ["eastmoney.com"]

    # Return the most recent date already stored in the longhubang table.
    def get_lastest_date(self):
        con = MySQLdb.connect(host="192.168.0.114", port=3306, user="root", passwd="fit123456", charset="utf8",
                              db="pachong")
        cur = con.cursor()
        sql = "select date from longhubang order by date desc limit 0,1"
        cur.execute(sql)
        result = cur.fetchone()
        cur.close()
        con.close()
        if result is None or len(result) == 0:
            print 'table longhubang seems to be empty or broken!'
            exit(-1)
        return result[0]

    # Build detail-page URLs for every trading day not yet present in the table.
    def build_url_by_loss_date(self):
        latest_date = self.get_lastest_date()
        update_con = MySQLdb.connect(host="192.168.0.114", port=3306, user="root", passwd="fit123456", charset="utf8",
                                     db="update")
        sql = "select distinct SH_tradeday as date from tradeday where SH_tradeday >'%s' " % latest_date
        df = pd.read_sql(sql, con=update_con)
        df['date'] = df['date'].map(lambda x: x.to_datetime().date())
        date_list = df['date'].tolist()
        ret_urls = []
        i = 1.0
        length = len(date_list)
        for date in date_list:
            print "build urls", i / length, "complete!"
            # The list page returns JSON-like text containing the codes of the
            # stocks that made the longhubang on that day.
            html_url = '''http://data.eastmoney.com/DataCenter_V3/stock2016/TradeDetail/pagesize=200,page=1,sortRule=-1,sortType=,startDate=%s,endDate=%s''' % (
                date, date)
            html_url += ",gpfw=0,js=var%20data_tab_1.html"
            wp = urllib.urlopen(html_url)
            content = wp.read()
            code_list = re.findall(r"SCode\":\"\d\d\d\d\d\d", content)
            code_list = list(set(code_list))
            code_list = map(lambda x: x[-6:], code_list)
            url_list = map(lambda x: '''http://data.eastmoney.com/stock/lhb,%s,%s.html''' % (date, x), code_list)
            ret_urls.extend(url_list)
            i += 1
        return ret_urls

    def start_requests(self):
        url_list = self.build_url_by_loss_date()
        for url in url_list:
            yield self.make_requests_from_url(url)

    def parse(self, response):
        stock_code = response.url.split(',')[2][0:6]
        date = response.url.split(',')[1]
        reason_list = response.xpath("//div[@class='left con-br']/text()").extract()
        if reason_list is None or len(reason_list) == 0:
            print 'invalid link: no longhubang detail found'
            return
        table1s = response.xpath("//table[@class='default_tab stock-detail-tab']")
        table2s = response.xpath("//table[@class='default_tab tab-2']")
        # Buy-side tables.
        for i in range(0, len(table1s)):
            table1 = table1s[i].xpath(".//tbody/tr")
            reason = None
            if i < len(reason_list):
                reason = reason_list[i]
                reason = reason.encode("utf8")
                reason = reason.split(":")[1]
            for one in table1:
                tmp = one.xpath("./td/text()").extract()
                if len(tmp) < 10:
                    break
                department = one.xpath("./td/div[@class='sc-name']/a/text()").extract()
                item = GetContentItem()
                item['tag'] = 1  # 1 = buy side
                if department is None or len(department) == 0:
                    item['department'] = None
                else:
                    item['department'] = department[0].encode("utf8")
                item['date'] = date
                item['stock_code'] = stock_code
                item['reason'] = reason
                item['serial_number'] = tmp[0]
                item['buy'] = tmp[5]
                item['buy_percent'] = tmp[6]
                item['sell'] = tmp[7]
                item['sell_percent'] = tmp[8]
                item['net'] = tmp[9]
                yield item
        # Sell-side tables.
        for i in range(0, len(table2s)):
            table2 = table2s[i].xpath(".//tbody/tr")
            reason = None
            if i < len(reason_list):
                reason = reason_list[i]
                reason = reason.encode("utf8")
                reason = reason.split(":")[1]
            for one in table2:
                tmp = one.xpath("./td/text()").extract()
                if len(tmp) < 9:
                    break
                department = one.xpath("./td/div[@class='sc-name']/a/text()").extract()
                item = GetContentItem()
                item['tag'] = 2  # 2 = sell side
                if department is None or len(department) == 0:
                    item['department'] = None
                else:
                    item['department'] = department[0].encode("utf8")
                item['date'] = date
                item['stock_code'] = stock_code
                item['reason'] = reason
                item['serial_number'] = tmp[0]
                item['buy'] = tmp[4]
                item['buy_percent'] = tmp[5]
                item['sell'] = tmp[6]
                item['sell_percent'] = tmp[7]
                item['net'] = tmp[8]
                yield item
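
parse() recovers the date and the stock code from the response URL rather than from the page body, so the detail URLs produced by build_url_by_loss_date() must keep their comma-separated layout. A tiny illustration with a made-up date and code:

```python
url = 'http://data.eastmoney.com/stock/lhb,2016-03-01,600000.html'
date = url.split(',')[1]              # '2016-03-01'
stock_code = url.split(',')[2][0:6]   # '600000'
```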
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = get_content.settings

[deploy]
#url = http://localhost:6800/
project = get_content
--------------------------------------------------------------------------------
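
Besides the `scrapy crawl get_content` command from the README, the crawl can also be started from a plain Python script: `get_project_settings()` locates `get_content.settings` through the `[settings]` section of scrapy.cfg, so the script below (a minimal sketch, to be run from the project root) picks up the same configuration.

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Run the get_content spider with the project settings (blocks until the crawl finishes).
process = CrawlerProcess(get_project_settings())
process.crawl('get_content')
process.start()
```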