├── README.md
├── get_content
│   ├── __init__.py
│   ├── items.py
│   ├── model.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── get_content_spider.py
└── scrapy.cfg

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
#### Scraping the longhubang (龙虎榜) list from 东方财富网 (Eastmoney)

* Uses Scrapy to crawl longhubang data from 东方财富网.
* Implements incremental updates: only dates after the latest date already stored in the database are crawled.
* Uses an item pipeline to clean the intermediate data.
* Uses peewee to simplify the database writes.
* To run it, change into the project directory and execute: scrapy crawl get_content
* The trading-day dates are read from a database table; wire in your own date source if you want to use the project (a stand-in sketch follows right after this README).
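
The last point refers to `build_url_by_loss_date()` in `spiders/get_content_spider.py`, which reads Shanghai trading days from a separate MySQL database (`update`.`tradeday`, column `SH_tradeday`). If you do not have that table, one minimal stand-in is to approximate trading days with pandas business days. This is only a sketch: exchange holidays are not excluded, and `get_trade_days` is a hypothetical helper, not part of the project.

```python
import pandas as pd

def get_trade_days(start, end):
    # Hypothetical replacement for the tradeday query: business days between
    # two 'YYYY-MM-DD' strings, returned as datetime.date objects.
    # Exchange holidays are NOT excluded, so this is only an approximation.
    return [ts.date() for ts in pd.bdate_range(start, end)]
```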
--------------------------------------------------------------------------------
/get_content/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donadonny/get_longhubang/6ce325a38ec1f084181cd3fc7e83762c87b536e9/get_content/__init__.py

--------------------------------------------------------------------------------
/get_content/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class GetContentItem(scrapy.Item):
    # One longhubang row: a broker seat's buy/sell figures for a stock on a date.
    date = scrapy.Field()
    stock_code = scrapy.Field()
    department = scrapy.Field()      # broker seat (营业部) name
    buy = scrapy.Field()
    buy_percent = scrapy.Field()
    sell = scrapy.Field()
    sell_percent = scrapy.Field()
    net = scrapy.Field()
    tag = scrapy.Field()             # 1 = buy-side table, 2 = sell-side table
    serial_number = scrapy.Field()
    reason = scrapy.Field()          # reason the stock made the list

--------------------------------------------------------------------------------
/get_content/model.py:
--------------------------------------------------------------------------------
# __author__ = 'fit'
# -*- coding: utf-8 -*-
from peewee import *

db = MySQLDatabase(database="pachong", host="192.168.0.114", port=3306, user="root", passwd="fit123456",
                   charset="utf8")
db.connect()


# ORM model for the longhubang (trading leaderboard) table.
class longhubang(Model):
    stock_code = CharField()
    date = DateField()
    department = CharField()
    buy = DoubleField(null=True)
    buy_percent = DoubleField(null=True)
    sell = DoubleField(null=True)
    sell_percent = DoubleField(null=True)
    net = DoubleField(null=True)
    reason = CharField(null=True)
    tag = IntegerField()
    serial_number = IntegerField(null=True)

    class Meta:
        database = db

--------------------------------------------------------------------------------
/get_content/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import datetime
import re
from model import *


class transformPipeline(object):
    # Convert the raw strings scraped by the spider into typed values.
    def __init__(self):
        # Matches an optionally signed integer or decimal, e.g. "-12.3".
        self.pattern = re.compile(r'^(-?\d+)(\.\d+)?$')

    def process_item(self, item, spider):
        item['date'] = datetime.datetime.strptime(item['date'], "%Y-%m-%d").date()
        if self.pattern.match(item['buy']):
            item['buy'] = float(item['buy'])
        else:
            item['buy'] = None
        if self.pattern.match(item['sell']):
            item['sell'] = float(item['sell'])
        else:
            item['sell'] = None
        if self.pattern.match(item['net']):
            item['net'] = float(item['net'])
        else:
            item['net'] = None
        # Percentages arrive as strings like "3.21%"; strip the trailing "%".
        if len(item['buy_percent']) <= 1:
            item['buy_percent'] = None
        else:
            item['buy_percent'] = float(item['buy_percent'][:-1])

        if len(item['sell_percent']) <= 1:
            item['sell_percent'] = None
        else:
            item['sell_percent'] = float(item['sell_percent'][:-1])
        if self.pattern.match(item['serial_number']):
            item['serial_number'] = int(item['serial_number'])
        else:
            item['serial_number'] = None
        return item


class GetContentPipeline(object):
    # Persist each cleaned item as one row of the longhubang table.
    def process_item(self, item, spider):
        # Plain insert; an upsert here turned out to be too slow.
        one = longhubang(
            stock_code=item['stock_code'],
            date=item['date'],
            reason=item['reason'],
            department=item['department'],
            sell=item['sell'],
            sell_percent=item['sell_percent'],
            buy=item['buy'],
            buy_percent=item['buy_percent'],
            tag=item['tag'],
            net=item['net'],
            serial_number=item['serial_number'],
        )
        one.save()
        return item
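
GetContentPipeline assumes the longhubang table already exists; peewee can create it once from the model definition above. A one-off setup sketch, assuming the connection settings in model.py point at a reachable MySQL server (`safe=True` makes the call a no-op if the table is already there):

```python
# One-off setup sketch: create the longhubang table from the peewee model.
from get_content.model import db, longhubang

db.create_tables([longhubang], safe=True)
```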
--------------------------------------------------------------------------------
/get_content/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for get_content project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'get_content'

SPIDER_MODULES = ['get_content.spiders']
NEWSPIDER_MODULE = 'get_content.spiders'
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Referer': 'http://data.eastmoney.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
}
DOWNLOAD_DELAY = 1
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'get_content (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN=16
# CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
# COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED=False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'get_content.middlewares.MyCustomSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'get_content.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'get_content.pipelines.transformPipeline': 300,
    'get_content.pipelines.GetContentPipeline': 800,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
# AUTOTHROTTLE_ENABLED=True
# The initial download delay
# AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED=True
# HTTPCACHE_EXPIRATION_SECS=0
# HTTPCACHE_DIR='httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES=[]
# HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
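
ITEM_PIPELINES above runs the two pipelines in ascending priority order: transformPipeline (300) turns the raw strings scraped from the page into typed values, and GetContentPipeline (800) then writes the row to MySQL. A rough sketch of that flow for a single hand-made item; the field values are invented, and because pipelines.py imports model.py, the MySQL server configured there has to be reachable even for the first step:

```python
from get_content.items import GetContentItem
from get_content.pipelines import transformPipeline, GetContentPipeline

item = GetContentItem(date='2016-03-01', stock_code='600000', department='example seat',
                      reason='example reason', tag=1, serial_number='1',
                      buy='1234.56', buy_percent='3.21%', sell='-', sell_percent='%',
                      net='1234.56')
item = transformPipeline().process_item(item, spider=None)   # strings -> date/float/int/None
GetContentPipeline().process_item(item, spider=None)          # one row inserted via peewee
```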
--------------------------------------------------------------------------------
/get_content/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/get_content/spiders/get_content_spider.py:
--------------------------------------------------------------------------------
# __author__ = 'fit'
# -*- coding: utf-8 -*-
from get_content.model import *
from get_content.items import *
import urllib
import re
import MySQLdb
import pandas as pd
import scrapy


class get_content_spider(scrapy.Spider):
    name = "get_content"
    allowed_domains = ["eastmoney.com"]

    # Return the most recent date already stored in the longhubang table.
    def get_lastest_date(self):
        con = MySQLdb.connect(host="192.168.0.114", port=3306, user="root", passwd="fit123456", charset="utf8",
                              db="pachong")
        cur = con.cursor()
        sql = "select date from longhubang order by date desc limit 0,1"
        cur.execute(sql)
        result = cur.fetchone()
        cur.close()
        con.close()
        if result is None or len(result) == 0:
            print 'table longhubang seems to be empty or broken!'
            exit(-1)
        return result[0]

    # Build detail-page URLs for every trading day not yet present in the table.
    def build_url_by_loss_date(self):
        latest_date = self.get_lastest_date()
        update_con = MySQLdb.connect(host="192.168.0.114", port=3306, user="root", passwd="fit123456", charset="utf8",
                                     db="update")
        sql = "select distinct SH_tradeday as date from tradeday where SH_tradeday >'%s' " % latest_date
        df = pd.read_sql(sql, con=update_con)
        df['date'] = df['date'].map(lambda x: x.to_datetime().date())
        date_list = df['date'].tolist()
        ret_urls = []
        i = 1.0
        length = len(date_list)
        for date in date_list:
            print "build urls", i / length, "complete!"
            # The list page returns JSON-like text containing the codes of the
            # stocks that made the longhubang on that day.
            html_url = '''http://data.eastmoney.com/DataCenter_V3/stock2016/TradeDetail/pagesize=200,page=1,sortRule=-1,sortType=,startDate=%s,endDate=%s''' % (
                date, date)
            html_url += ",gpfw=0,js=var%20data_tab_1.html"
            wp = urllib.urlopen(html_url)
            content = wp.read()
            code_list = re.findall(r"SCode\":\"\d\d\d\d\d\d", content)
            code_list = list(set(code_list))
            code_list = map(lambda x: x[-6:], code_list)
            url_list = map(lambda x: '''http://data.eastmoney.com/stock/lhb,%s,%s.html''' % (date, x), code_list)
            ret_urls.extend(url_list)
            i += 1
        return ret_urls

    def start_requests(self):
        url_list = self.build_url_by_loss_date()
        for url in url_list:
            yield self.make_requests_from_url(url)

    def parse(self, response):
        stock_code = response.url.split(',')[2][0:6]
        date = response.url.split(',')[1]
        reason_list = response.xpath("//div[@class='left con-br']/text()").extract()
        if reason_list is None or len(reason_list) == 0:
            print 'invalid link: no longhubang detail found'
            return
        table1s = response.xpath("//table[@class='default_tab stock-detail-tab']")
        table2s = response.xpath("//table[@class='default_tab tab-2']")
        # Buy-side tables.
        for i in range(0, len(table1s)):
            table1 = table1s[i].xpath(".//tbody/tr")
            reason = None
            if i < len(reason_list):
                reason = reason_list[i]
                reason = reason.encode("utf8")
                reason = reason.split(":")[1]
            for one in table1:
                tmp = one.xpath("./td/text()").extract()
                if len(tmp) < 10:
                    break
                department = one.xpath("./td/div[@class='sc-name']/a/text()").extract()
                item = GetContentItem()
                item['tag'] = 1  # 1 = buy side
                if department is None or len(department) == 0:
                    item['department'] = None
                else:
                    item['department'] = department[0].encode("utf8")
                item['date'] = date
                item['stock_code'] = stock_code
                item['reason'] = reason
                item['serial_number'] = tmp[0]
                item['buy'] = tmp[5]
                item['buy_percent'] = tmp[6]
                item['sell'] = tmp[7]
                item['sell_percent'] = tmp[8]
                item['net'] = tmp[9]
                yield item
        # Sell-side tables.
        for i in range(0, len(table2s)):
            table2 = table2s[i].xpath(".//tbody/tr")
            reason = None
            if i < len(reason_list):
                reason = reason_list[i]
                reason = reason.encode("utf8")
                reason = reason.split(":")[1]
            for one in table2:
                tmp = one.xpath("./td/text()").extract()
                if len(tmp) < 9:
                    break
                department = one.xpath("./td/div[@class='sc-name']/a/text()").extract()
                item = GetContentItem()
                item['tag'] = 2  # 2 = sell side
                if department is None or len(department) == 0:
                    item['department'] = None
                else:
                    item['department'] = department[0].encode("utf8")
                item['date'] = date
                item['stock_code'] = stock_code
                item['reason'] = reason
                item['serial_number'] = tmp[0]
                item['buy'] = tmp[4]
                item['buy_percent'] = tmp[5]
                item['sell'] = tmp[6]
                item['sell_percent'] = tmp[7]
                item['net'] = tmp[8]
                yield item
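
parse() recovers the date and the stock code from the response URL rather than from the page body, so the detail URLs produced by build_url_by_loss_date() must keep their comma-separated layout. A tiny illustration with a made-up date and code:

```python
url = 'http://data.eastmoney.com/stock/lhb,2016-03-01,600000.html'
date = url.split(',')[1]              # '2016-03-01'
stock_code = url.split(',')[2][0:6]   # '600000'
```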
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = get_content.settings

[deploy]
#url = http://localhost:6800/
project = get_content
--------------------------------------------------------------------------------
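
Besides the `scrapy crawl get_content` command from the README, the crawl can also be started from a plain Python script: `get_project_settings()` locates `get_content.settings` through the `[settings]` section of scrapy.cfg, so the script below (a minimal sketch, to be run from the project root) picks up the same configuration.

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Run the get_content spider with the project settings (blocks until the crawl finishes).
process = CrawlerProcess(get_project_settings())
process.crawl('get_content')
process.start()
```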