├── LICENSE
├── README.md
└── src
    ├── crawl_data
        ├── .crawl_cghsz.sh.swp
        ├── .crawl_cgzsz.sh.swp
        ├── crawl_all.sh
        ├── crawl_attachment.sh
        ├── crawl_cghsz.sh
        ├── crawl_data
        │   ├── __init__.py
        │   ├── items.py
        │   ├── middlewares.py
        │   ├── pipelines.py
        │   ├── settings.py
        │   └── spiders
        │   │   ├── AnhuiSpider.py
        │   │   ├── BeijingSpider.py
        │   │   ├── CentralSpider.py
        │   │   ├── ChongqingSpider.py
        │   │   ├── FujianSpider.py
        │   │   ├── GansuSpider.py
        │   │   ├── GuangdongSpider.py
        │   │   ├── GuangxiSpider.py
        │   │   ├── GuizhouSpider.py
        │   │   ├── HainanSpider.py
        │   │   ├── HebeiSpider.py
        │   │   ├── HeilongjiangSpider.py
        │   │   ├── HenanSpider.py
        │   │   ├── HubeiSpider.py
        │   │   ├── HunanSpider.py
        │   │   ├── JiangsuSpider.py
        │   │   ├── JiangxiSpider.py
        │   │   ├── JilinSpider.py
        │   │   ├── LiaoningSpider.py
        │   │   ├── NeimengguSpider.py
        │   │   ├── NingxiaSpider.py
        │   │   ├── QinghaiSpider.py
        │   │   ├── Shaanxi_shanSpider.py
        │   │   ├── ShandongSpider.py
        │   │   ├── Shandong_leftSpider.py
        │   │   ├── ShanghaiSpider.py
        │   │   ├── Shanxi_jinSpider.py
        │   │   ├── SichuanSpider.py
        │   │   ├── TianjinSpider.py
        │   │   ├── XinjiangSpider.py
        │   │   ├── XizangSpider.py
        │   │   ├── YunnanSpider.py
        │   │   ├── ZhejiangSpider.py
        │   │   └── __init__.py
        ├── crwal_jsst.sh
        ├── scrapy.cfg
        ├── test_get_ip.py
        └── test_proxy.py
    ├── crawl_test
        └── sichuan.py
    ├── data_analysis
        ├── preprocess.py
        ├── text_cluster.py
        ├── topic.py
        ├── wordcloud_diamond.py
        └── wordcloud_plot.py
    └── doc_tex
        └── en.tex


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 JinhuaSu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/crawl_data/.crawl_cghsz.sh.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SmartDataLab/Policy_crawler/fb9fcb7ab701dfb98606afe9f7260f2f2e857506/src/crawl_data/.crawl_cghsz.sh.swp


--------------------------------------------------------------------------------
/src/crawl_data/.crawl_cgzsz.sh.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SmartDataLab/Policy_crawler/fb9fcb7ab701dfb98606afe9f7260f2f2e857506/src/crawl_data/.crawl_cgzsz.sh.swp


--------------------------------------------------------------------------------
/src/crawl_data/crawl_all.sh:
--------------------------------------------------------------------------------
 1 | scrapy crawl  Zhejiang
 2 | scrapy crawl  Jiangxi
 3 | scrapy crawl  Fujian
 4 | scrapy crawl  Sichuan
 5 | scrapy crawl  Heilongjiang
 6 | scrapy crawl  Ningxia
 7 | scrapy crawl  Guizhou
 8 | scrapy crawl  Guangxi
 9 | scrapy crawl  Xinjiang
10 | scrapy crawl  Jiangsu
11 | scrapy crawl  Shandong
12 | scrapy crawl  Hebei
13 | scrapy crawl  Jilin
14 | scrapy crawl  Shanghai
15 | scrapy crawl  Gansu
16 | scrapy crawl  Henan
17 | scrapy crawl  Qinghai
18 | scrapy crawl  Xizang
19 | scrapy crawl  Central
20 | scrapy crawl  Tianjin
21 | scrapy crawl  Anhui
22 | scrapy crawl  Neimenggu
23 | scrapy crawl  Guangdong
24 | scrapy crawl  Shaanxi_shan
25 | scrapy crawl  Chongqing
26 | scrapy crawl  Liaoning
27 | scrapy crawl  Hunan
28 | scrapy crawl  Beijing
29 | scrapy crawl  Shanxi_jin
30 | scrapy crawl  Hainan
31 | scrapy crawl  Yunnan
32 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_attachment.sh:
--------------------------------------------------------------------------------
 1 | scrapy crawl  Hunan
 2 | scrapy crawl  Guangdong
 3 | scrapy crawl  Yunnan
 4 | scrapy crawl  Sichuan
 5 | scrapy crawl  Neimenggu
 6 | scrapy crawl  Shangdong
 7 | scrapy crawl  Shanghai
 8 | scrapy crawl  Guangxi
 9 | scrapy crawl  Hebei
10 | scrapy crawl  Xinjiang
11 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_cghsz.sh:
--------------------------------------------------------------------------------
1 | scrapy crawl Chongqing
2 | scrapy crawl Guangxi
3 | scrapy crawl Hainan
4 | scrapy crawl Shandong
5 | scrapy crawl Zhejiang


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SmartDataLab/Policy_crawler/fb9fcb7ab701dfb98606afe9f7260f2f2e857506/src/crawl_data/crawl_data/__init__.py


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://docs.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class CrawlDataItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | from scrapy import signals
  9 | from scrapy.http import HtmlResponse
 10 | import time
 11 | import random
 12 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
 13 | import requests
 14 | import json
 15 | 
 16 | user_agent_list = [
 17 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
 18 |     "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
 19 |     "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
 20 |     "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
 21 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
 22 |     "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
 23 |     "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
 24 |     "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
 25 |     "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
 26 |     "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
 27 |     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
 28 |     "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
 29 |     "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
 30 |     "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
 31 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
 32 |     "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
 33 |     "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
 34 |     "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
 35 |     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
 36 |     "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
 37 |     "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
 38 |     "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
 39 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
 40 |     "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
 41 |     "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
 42 |     "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
 43 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
 44 |     "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
 45 |     "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
 46 |     "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
 47 |     "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
 48 |     "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
 49 |     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
 50 |     "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
 51 |     "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
 52 |     "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
 53 | ]
 54 | 
 55 | 
 56 | class RandomUserAgent(UserAgentMiddleware):  # 如何运行此中间件? settings 直接添加就OK
 57 |     def process_request(self, request, spider):
 58 |         ua = random.choice(user_agent_list)
 59 |         # 在请求头里设置ua
 60 |         request.headers.setdefault("User-Agent", ua)
 61 | 
 62 | 
 63 | class CrawlDataSpiderMiddleware(object):
 64 |     # Not all methods need to be defined. If a method is not defined,
 65 |     # scrapy acts as if the spider middleware does not modify the
 66 |     # passed objects.
 67 | 
 68 |     @classmethod
 69 |     def from_crawler(cls, crawler):
 70 |         # This method is used by Scrapy to create your spiders.
 71 |         s = cls()
 72 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 73 |         return s
 74 | 
 75 |     def process_spider_input(self, response, spider):
 76 |         # Called for each response that goes through the spider
 77 |         # middleware and into the spider.
 78 | 
 79 |         # Should return None or raise an exception.
 80 |         return None
 81 | 
 82 |     def process_spider_output(self, response, result, spider):
 83 |         # Called with the results returned from the Spider, after
 84 |         # it has processed the response.
 85 | 
 86 |         # Must return an iterable of Request, dict or Item objects.
 87 |         for i in result:
 88 |             yield i
 89 | 
 90 |     def process_spider_exception(self, response, exception, spider):
 91 |         # Called when a spider or process_spider_input() method
 92 |         # (from other spider middleware) raises an exception.
 93 | 
 94 |         # Should return either None or an iterable of Request, dict
 95 |         # or Item objects.
 96 |         pass
 97 | 
 98 |     def process_start_requests(self, start_requests, spider):
 99 |         # Called with the start requests of the spider, and works
100 |         # similarly to the process_spider_output() method, except
101 |         # that it doesn’t have a response associated.
102 | 
103 |         # Must return only requests (not items).
104 |         for r in start_requests:
105 |             yield r
106 | 
107 |     def spider_opened(self, spider):
108 |         spider.logger.info("Spider opened: %s" % spider.name)
109 | 
110 | 
111 | class CrawlDataDownloaderMiddleware(object):
112 |     # Not all methods need to be defined. If a method is not defined,
113 |     # scrapy acts as if the downloader middleware does not modify the
114 |     # passed objects.
115 | 
116 |     @classmethod
117 |     def from_crawler(cls, crawler):
118 |         # This method is used by Scrapy to create your spiders.
119 |         s = cls()
120 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
121 |         return s
122 | 
123 |     def process_request(self, request, spider):
124 |         # Called for each request that goes through the downloader
125 |         # middleware.
126 | 
127 |         # Must either:
128 |         # - return None: continue processing this request
129 |         # - or return a Response object
130 |         # - or return a Request object
131 |         # - or raise IgnoreRequest: process_exception() methods of
132 |         #   installed downloader middleware will be called
133 |         return None
134 | 
135 |     def process_response(self, request, response, spider):
136 |         # Called with the response returned from the downloader.
137 | 
138 |         # Must either;
139 |         # - return a Response object
140 |         # - return a Request object
141 |         # - or raise IgnoreRequest
142 |         return response
143 | 
144 |     def process_exception(self, request, exception, spider):
145 |         # Called when a download handler or a process_request()
146 |         # (from other downloader middleware) raises an exception.
147 | 
148 |         # Must either:
149 |         # - return None: continue processing this exception
150 |         # - return a Response object: stops process_exception() chain
151 |         # - return a Request object: stops process_exception() chain
152 |         pass
153 | 
154 |     def spider_opened(self, spider):
155 |         spider.logger.info("Spider opened: %s" % spider.name)
156 | 
157 | 
158 | class SeleniumDownloaderMiddleware(object):
159 |     def __init__(self):
160 |         self.lasttime = time.time()
161 |         self.lastip = self.get_proxy()
162 | 
163 |     # 可以拦截到request请求
164 |     def process_request(self, request, spider):
165 |         if spider.name in [
166 |             "Central",
167 |             "Henan",
168 |             "Xizang",
169 |             "Tianjin",
170 |             "Anhui",
171 |             "Yunnan",
172 |             "Shaanxi_shan",
173 |             "Shandong",
174 |         ]:
175 |             t = time.time()
176 |             if t - self.lasttime <= 10:
177 |                 ret_proxy = self.lastip
178 |             else:
179 |                 ret_proxy = self.get_proxy()
180 |                 if len(ret_proxy) > 0:
181 |                     self.lastip = ret_proxy
182 |                     self.lasttime = t
183 |                 else:
184 |                     ret_proxy = self.lastip
185 |             request.meta["proxy"] = ret_proxy
186 |             print("为%s添加代理%s" % (request.url, ret_proxy), end="")
187 |         else:
188 |             # 在进行url访问之前可以进行的操作, 更换UA请求头, 使用其他代理等
189 |             pass
190 | 
191 |     # 可以拦截到response响应对象(拦截下载器传递给Spider的响应对象)
192 |     def process_response(self, request, response, spider):
193 |         """
194 |         三个参数:
195 |         # request: 响应对象所对应的请求对象
196 |         # response: 拦截到的响应对象
197 |         # spider: 爬虫文件中对应的爬虫类 WangyiSpider 的实例对象, 可以通过这个参数拿到 WangyiSpider 中的一些属性或方法
198 |         """
199 | 
200 |         #  对页面响应体数据的篡改, 如果是每个模块的 url 请求, 则处理完数据并进行封装
201 |         if spider.name in ["Hubei"]:
202 |             spider.browser.get(url=request.url)
203 |             spider.browser.refresh()
204 |             time.sleep(0.5)
205 |             row_response = spider.browser.page_source
206 |             return HtmlResponse(
207 |                 url=spider.browser.current_url,
208 |                 body=row_response,
209 |                 encoding="utf8",
210 |                 request=request,
211 |             )  # 参数url指当前浏览器访问的url(通过current_url方法获取), 在这里参数url也可以用request.url
212 |             # 参数body指要封装成符合HTTP协议的源数据, 后两个参数可有可无
213 |         else:
214 |             return response  # 是原来的主页的响应对象
215 | 
216 |     # 请求出错了的操作, 比如ip被封了,可以在这里设置ip代理
217 |     def process_exception(self, request, exception, spider):
218 |         if spider.name in [
219 |             "Central",
220 |             "Xizang",
221 |             "Tianjin",
222 |             "Anhui",
223 |             "Yunnan",
224 |             "Shaanxi_shan",
225 |             "Shandong",
226 |         ]:
227 |             print("添加代理开始")
228 |             t = time.time()
229 |             if t - self.lasttime <= 10:
230 |                 ret_proxy = self.lastip
231 |             else:
232 |                 ret_proxy = self.get_proxy()
233 |                 if len(ret_proxy) > 0:
234 |                     self.lastip = ret_proxy
235 |                     self.lasttime = t
236 |                 else:
237 |                     ret_proxy = self.lastip
238 |             request.meta["proxy"] = ret_proxy
239 |             print("为%s添加代理%s" % (request.url, ret_proxy), end="")
240 |             return request
241 |         else:
242 |             return None
243 | 
244 |     def get_proxy(self):
245 |         url = "https://api.xiaoxiangdaili.com/ip/get?appKey=660304975683276800&appSecret=GUCGRDQv&cnt=1&method=http&releaseAuto=false&wt=json"
246 | 
247 |         s = ""
248 |         resp = requests.get(url)
249 |         if resp.status_code == 200:
250 |             x = json.loads(resp.text)
251 |             s = "http://%s:%s" % (x["data"][0]["ip"], x["data"][0]["port"])
252 |         return s
253 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import pymongo
 4 | import pandas as pd
 5 | import os
 6 | # Define your item pipelines here
 7 | #
 8 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 9 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
10 | 
11 | 
12 | class CrawlDataPipeline(object):
13 |     def process_item(self, item, spider):
14 |         return item
15 | 
16 | class PolicyMongoPipeline(object):
17 |     def __init__(self, mongo_uri, mongo_db):
18 |         self.mongo_uri = mongo_uri
19 |         self.mongo_db = mongo_db
20 |     
21 |     @classmethod
22 |     def from_crawler(cls, crawler):
23 |         return cls(
24 |             mongo_uri = crawler.settings.get('MONGO_URI'),
25 |             mongo_db = crawler.settings.get('MONGO_DB'),
26 |         )
27 | 
28 |     def open_spider(self, spider):
29 |         self.mongo_col = spider.name
30 |         self.client = pymongo.MongoClient(self.mongo_uri)
31 |         self.db = self.client[self.mongo_db]
32 |         self.db[self.mongo_col].drop()
33 | 
34 |     def close_spider(self, spider):
35 |         if not os.path.exists('../../data/csv'):
36 |             os.makedirs('../../data/csv')
37 |         if not os.path.exists('../../data/empty'):
38 |             os.makedirs('../../data/empty')
39 |         table = self.db[self.mongo_col]
40 |         data_list = []
41 |         empty_list = []
42 |         for raw_dict in table.find():
43 |             #data_list.append({key:value for key,value in  raw_dict.items() if key in ['UID','title','date','url']})
44 |             try:
45 |                 if raw_dict['crawl state'] == 'full':
46 |                     data_list.append({key:raw_dict[key] for key in ['UID','title','date','url','FileNumber','crawl state','text length']})
47 |                 else:
48 |                     empty_list.append({key:raw_dict[key] for key in ['UID','title','date','url','FileNumber','crawl state','text length']})
49 |             except:
50 |                 print(raw_dict)
51 |         df = pd.DataFrame(data_list)
52 |         print(df)
53 |         df.to_csv('../../data/csv/%s_news_list.csv' % spider.name,encoding='utf-8')
54 |         df = pd.DataFrame(empty_list)
55 |         print(df)
56 |         df.to_csv('../../data/empty/%s_empty_list.csv' % spider.name,encoding='utf-8')
57 |         self.client.close()
58 | 
59 |     def process_item(self, item, spider):
60 |         table = self.db[self.mongo_col]
61 |         if item['crawl state'] == 'half':
62 |             table.insert(item)
63 |         else:
64 |             table.update_one({'UID':item['UID']},{'$set':item},upsert=True)
65 |         return item


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for crawl_data project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://docs.scrapy.org/en/latest/topics/settings.html
 9 | #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'crawl_data'
13 | 
14 | SPIDER_MODULES = ['crawl_data.spiders']
15 | NEWSPIDER_MODULE = 'crawl_data.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'crawl_data (+http://www.yourdomain.com)'
20 | USER_AGENT='Mozilla/5.0'
21 | 
22 | # Obey robots.txt rules
23 | ROBOTSTXT_OBEY = False
24 | 
25 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
26 | #CONCURRENT_REQUESTS = 32
27 | 
28 | # Configure a delay for requests for the same website (default: 0)
29 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
30 | # See also autothrottle settings and docs
31 | #DOWNLOAD_DELAY = 3
32 | # The download delay setting will honor only one of:
33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
34 | #CONCURRENT_REQUESTS_PER_IP = 16
35 | 
36 | # Disable cookies (enabled by default)
37 | #COOKIES_ENABLED = False
38 | 
39 | # Disable Telnet Console (enabled by default)
40 | #TELNETCONSOLE_ENABLED = False
41 | 
42 | # Override the default request headers:
43 | #DEFAULT_REQUEST_HEADERS = {
44 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
45 | #   'Accept-Language': 'en',
46 | #}
47 | 
48 | # Enable or disable spider middlewares
49 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
50 | #SPIDER_MIDDLEWARES = {
51 | #    'crawl_data.middlewares.CrawlDataSpiderMiddleware': 543,
52 | #}
53 | 
54 | # Enable or disable downloader middlewares
55 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
56 | DOWNLOADER_MIDDLEWARES = {
57 | #    'crawl_data.middlewares.CrawlDataDownloaderMiddleware': 543,
58 |    'crawl_data.middlewares.SeleniumDownloaderMiddleware': 543,
59 |    'crawl_data.middlewares.RandomUserAgent': 542,
60 | }
61 | 
62 | # Enable or disable extensions
63 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
64 | #EXTENSIONS = {
65 | #    'scrapy.extensions.telnet.TelnetConsole': None,
66 | #}
67 | 
68 | # Configure item pipelines
69 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
70 | ITEM_PIPELINES = {
71 |     #'crawl_data.pipelines.CrawlDataPipeline': 300,
72 |     'crawl_data.pipelines.PolicyMongoPipeline': 400
73 | }
74 | MONGO_URI = 'mongodb://localhost:27017'
75 | MONGO_DB = 'Policy'
76 | 
77 | # Enable and configure the AutoThrottle extension (disabled by default)
78 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
79 | #AUTOTHROTTLE_ENABLED = True
80 | # The initial download delay
81 | #AUTOTHROTTLE_START_DELAY = 5
82 | # The maximum download delay to be set in case of high latencies
83 | #AUTOTHROTTLE_MAX_DELAY = 60
84 | # The average number of requests Scrapy should be sending in parallel to
85 | # each remote server
86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
87 | # Enable showing throttling stats for every response received:
88 | #AUTOTHROTTLE_DEBUG = False
89 | 
90 | # Enable and configure HTTP caching (disabled by default)
91 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
92 | #HTTPCACHE_ENABLED = True
93 | #HTTPCACHE_EXPIRATION_SECS = 0
94 | #HTTPCACHE_DIR = 'httpcache'
95 | #HTTPCACHE_IGNORE_HTTP_CODES = []
96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
97 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/AnhuiSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | class AnhuiSpider(scrapy.Spider):
 9 |     name = "Anhui"
10 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
11 |         os.makedirs('../../data/HTML_pk/%s' % name)
12 |     if not os.path.exists('../../data/text/%s' % name):
13 |         os.makedirs('../../data/text/%s' % name)
14 |     def start_requests(self):
15 |         total_page = 448
16 |         # total_page = 3 
17 | 
18 |         headers = {
19 |             'Cookie':'yfx_c_g_u_id_10006888=_ck20032823561110758413715353987; yfx_f_l_v_t_10006888=f_t_1585410971071__r_t_1585410971071__v_t_1585410971071__r_c_0; UM_distinctid=17121db9612ad6-0ec7fe709cb527-31760856-ff000-17121db9613ce3; CNZZDATA3688016=cnzz_eid%3D1930396560-1585409032-%26ntime%3D1585409032; membercenterjsessionid=MWViOWE2ZDktYmRlNC00NWMzLWFiMWUtNjQ3NWU0OGYzNTFl; wzws_cid=7187a7347902f0d7885c71d456e608136714b577f2ee24a43314ebd9d84b64b6f438b3bf396c3d7cf59f1f30cc019242417df6f600b6f2ee80692577b8f7754ab14524ba3008de665fdf770f7f231f9a; SHIROJSESSIONID=82f1243e-8bc8-4770-8f88-8438524086a6',
20 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
21 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
22 |         }
23 |         url_base = 'http://www.ah.gov.cn/site/label/8888?IsAjax=1&dataType=html&_=0.42568734060518776&siteId=6781961&pageSize=16&pageIndex={0}&action=list&isDate=true&dateFormat=yyyy-MM-dd&length=46&organId=1681&type=4&catId=&cId=&result=%E6%9A%82%E6%97%A0%E7%9B%B8%E5%85%B3%E4%BF%A1%E6%81%AF&searchType=&keyWords=&specialCatIds=6708451%2C6708461%2C6708471&catIdExplainType=6711101tp_explain%2C6711111xwfbh_explain%2C6711121sp_explain%2C6711131ft_explain&labelName=publicInfoList&file=%2Fahxxgk%2FpublicInfoList-an-new'
24 |         for i in range(total_page):
25 |             yield scrapy.Request(url=url_base.format(i+1),headers=headers, callback=self.parse)
26 | 
27 |     def parse(self,response):
28 |         detail_page_links = []
29 |         for div in response.css('div.xxgk_navli'):
30 |             url = div.css('a::attr(href)').get()
31 |             UID = url.split('/')[-1][:-5]
32 |             date = div.css('span.date::text').get()
33 |             if date and len(date) > 1:
34 |                 date = date.replace('\r','')
35 |                 date = date.replace('\n','')
36 |                 date = date.replace('\t','')
37 |             if '?' not in UID:
38 |                 detail_page_links.append(url)
39 |             yield {
40 |                 'UID': UID,
41 |                 'title': div.css('a::attr(title)').get(),
42 |                 'date': date,
43 |                 'FileNumber':None,
44 |                 'url': url,
45 |                 'text length':0,
46 |                 'crawl state':'half'
47 |             }
48 |         try:
49 |             yield from response.follow_all(detail_page_links, callback = self.parse_content)
50 |         except:
51 |             print(response.text)
52 | 
53 |     def parse_content(self, response):
54 |         UID = response.url.split('/')[-1][:-5]
55 |         doc_info_dict = {}
56 |         td_list = response.css('tbody')[0].css('td')
57 |         th_list = response.css('tbody')[0].css('th')
58 |         for i in range(len(th_list)):
59 |             key = ''.join(th_list[i].css('::text').getall())
60 |             value = ''.join(td_list[i].css('::text').getall())
61 |             doc_info_dict[key] = value
62 |         File_num = None
63 |         if '文号：' in doc_info_dict.keys():
64 |             File_num = doc_info_dict['文号：']
65 |         paragraph_list =  response.css('div.wzcon p *::text').getall() 
66 |         if len(paragraph_list) == 0:
67 |             paragraph_list =  response.css('p *::text').getall() 
68 |         length = len(''.join(paragraph_list))
69 |         if length > 0:
70 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
71 |                 pickle.dump(response.text,f)
72 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
73 |                 f.write('\n'.join(paragraph_list))
74 |             return {
75 |                 'UID': UID,
76 |                 'FileNumber':File_num,
77 |                 'mainText': paragraph_list,
78 |                 'doc_info_dict':doc_info_dict,
79 |                 'crawl state':'full',
80 |                 'text length':length,
81 |             }
82 |         else:
83 |             return {
84 |                 'UID': UID,
85 |                 'mainText': paragraph_list,
86 |                 'crawl state':'empty',
87 |                 'text length':0,
88 |             }
89 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/BeijingSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import os
 3 | import pickle
 4 | 
 5 | class BeijingSpider(scrapy.Spider):
 6 |     name = "Beijing"
 7 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
 8 |         os.makedirs('../../data/HTML_pk/%s' % name)
 9 |     if not os.path.exists('../../data/text/%s' % name):
10 |         os.makedirs('../../data/text/%s' % name)
11 |     def start_requests(self):
12 |         total_page = 264
13 |         # total_page = 3
14 |         url_base = 'http://www.beijing.gov.cn/zhengce/zhengcefagui/index'
15 |         for i in range(total_page):
16 |             ref = '.html' if i == 0 else '_%s.html' % i 
17 |             yield scrapy.Request(url=url_base + ref, callback=self.parse)
18 | 
19 |     def parse(self,response):
20 |         detail_page_links = []
21 |         for piece in response.css('div.listBox ul.list li'):
22 |             href = piece.css('a::attr(href)').get()
23 |             url = response.urljoin(href)
24 |             detail_page_links.append(href)
25 |             UID = href.split('/')[-1][:-5]
26 |             if '?' not in UID:
27 |                 detail_page_links.append(url)
28 |             #response.follow(href, callbak = self.parse_content)
29 |             yield {
30 |                 'UID': UID,
31 |                 'title': piece.css('a::text').get(),
32 |                 'date': piece.css('span::text').get(),
33 |                 'url': url,
34 |                 'text length':0,
35 |                 'FileNumber': None,
36 |                 'crawl state':'half'
37 |             }
38 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
39 | 
40 |     def parse_content(self, response):
41 |         UID = response.url.split('/')[-1][:-5]
42 |         doc_info_dict = {}
43 |         container = response.css('div.container')[0]
44 |         for doc_info in container.css('ol li'):
45 |             doc_info_l = doc_info.css('::text').getall()
46 |             if len(doc_info_l) == 2:
47 |                 key,value = doc_info_l
48 |             elif len(doc_info_l) == 1:
49 |                 key = doc_info_l[0]
50 |                 value = ''
51 |             doc_info_dict[key] = value
52 |         full_tittle = container.css('div.header p::text').get()
53 |         paragraph_list = container.css('div.mainTextBox p::text').getall()
54 |         if len(paragraph_list) == 0:
55 |             paragraph_list =  response.css('p *::text').getall() 
56 |         Filenum = doc_info_dict["[发文字号] "] if "[发文字号] " in doc_info_dict.keys() else paragraph_list[0]
57 |         if Filenum and '号' not in Filenum:
58 |             Filenum = None
59 |         length = len(''.join(paragraph_list))
60 |         if length > 0:
61 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
62 |                 f.write('\n'.join(paragraph_list))
63 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
64 |                 pickle.dump(response.text,f)
65 |             return {
66 |                 'UID': UID,
67 |                 'full_tittle': full_tittle,
68 |                 'FileNumber':Filenum,
69 |                 'doc_info_dict': doc_info_dict,
70 |                 'mainText': paragraph_list,
71 |                 'text length':length,
72 |                 'crawl state': 'full',
73 |             }
74 |         else:
75 |             return {
76 |                 'UID': UID,
77 |                 'full_tittle': full_tittle,
78 |                 'FileNumber':Filenum,
79 |                 'doc_info_dict': doc_info_dict,
80 |                 'mainText': paragraph_list,
81 |                 'text length':length,
82 |                 'crawl state': 'empty',
83 |             }
84 |             
85 | 
86 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/CentralSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import json
 5 | 
 6 | # from selenium import webdriver
 7 | # from selenium.webdriver.firefox.options import Options
 8 | # options = Options()
 9 | # options.headless = True
10 | import random
11 | 
12 | 
13 | class CentralSpider(scrapy.Spider):
14 |     name = "Central"
15 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
16 |         os.makedirs('../../data/HTML_pk/%s' % name)
17 |     if not os.path.exists('../../data/text/%s' % name):
18 |         os.makedirs('../../data/text/%s' % name)
19 |     # def __init__(self):
20 |     #     self.browser = webdriver.Firefox(options=options)
21 |     #     self.browser.get('http://sousuo.gov.cn')
22 |     #     super().__init__()
23 |     # def close(self,spider):
24 |     #     self.browser.quit()
25 |     def start_requests(self):
26 |         total_page = 2466
27 |         # total_page = 3
28 |         #total_page =  50
29 |         url_base = 'http://sousuo.gov.cn/data?t=zhengcelibrary&q=&timetype=timeqb&mintime=&maxtime=&sort=pubtime&sortType=1&searchfield=title&pcodeJiguan=&childtype=&subchildtype=&tsbq=&pubtimeyear=&puborg=&pcodeYear=&pcodeNum=&filetype=&p={0}&n=5&inpro=&bmfl=&dup=&orpro='
30 |         for i in range(total_page):
31 |             yield scrapy.Request(url=url_base.format(i), callback=self.parse)
32 | 
33 |     def parse(self,response):
34 |         detail_page_links = []
35 |         for item in json.loads(response.text)['searchVO']['catMap']['gongbao']['listVO']:
36 |             url = item['url']
37 |             UID = url.split('/')[-1][:-4]
38 |             item['UID'] = UID
39 |             item['date'] = item['pubtimeStr'].replace('.','-')
40 |             item['FileNumber'] = item['wenhao']
41 |             item['crawl state'] = 'half'
42 |             item['text length'] = 0 
43 |             if '?' not in UID:
44 |                 detail_page_links.append(url)
45 |             yield item
46 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
47 | 
48 |     def parse_content(self, response):
49 |         UID = response.url.split('/')[-1][:-4]
50 |         
51 |         paragraph_list = response.css('div.pages_content p *::text').getall()        
52 |         
53 |         if len(paragraph_list) == 0:
54 |             paragraph_list =  response.css('p *::text').getall() 
55 |         length = len(''.join(paragraph_list))
56 |         if length > 0:
57 |             state = 'full'
58 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
59 |                 pickle.dump(response.text,f)
60 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
61 |                 f.write('\n'.join(paragraph_list))
62 |         else:
63 |             state = 'empty'
64 |         return {
65 |             'UID': UID,
66 |             'mainText': paragraph_list,
67 |             'crawl state':state,
68 |             'text length':length,
69 |         }
70 |             
71 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/ChongqingSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | 
 5 | 
 6 | class ChongqingSpider(scrapy.Spider):
 7 |     name = "Chongqing"
 8 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
 9 |         os.makedirs("../../data/HTML_pk/%s" % name)
10 |     if not os.path.exists("../../data/text/%s" % name):
11 |         os.makedirs("../../data/text/%s" % name)
12 | 
13 |     def start_requests(self):
14 |         url_dict = {
15 |             "http://www.cq.gov.cn/zwgk/fdzdgknr/lzyj/zfgz/zfgz_52609/index{0}.html": 12,
16 |             "http://www.cq.gov.cn/zwgk/fdzdgknr/lzyj/xzgfxwj/szf_38655/index{0}.html": 23,
17 |             "http://www.cq.gov.cn/zwgk/fdzdgknr/lzyj/xzgfxwj/szfbgt_38656/index{0}.html": 33,
18 |             "http://www.cq.gov.cn/zwgk/fdzdgknr/lzyj/qtgw/index{0}.html": 34,
19 |             "http://www.cq.gov.cn/zwgk/fdzdgknr/lzyj/rsrm/index{0}.html": 3,
20 |         }
21 |         for url_base, max_page in url_dict.items():
22 |             for i in range(max_page):
23 |                 page = "_" + str(i) if i > 0 else ""
24 |                 yield scrapy.Request(url=url_base.format(page), callback=self.parse)
25 | 
26 |     def parse(self, response):
27 |         detail_page_links = response.css("ul.list-cont li.w400 a::attr(href)").getall()
28 |         release_inst_l = response.css("ul.list-cont li.w120::text").getall()
29 |         title_l = response.css("ul.list-cont li.w400 a::text").getall()
30 |         release_date_l = response.css("ul.list-cont li.w172::text").getall()
31 |         date_l = response.css("ul.list-cont li.w110 span::text").getall()
32 | 
33 |         for i in range(len(detail_page_links)):
34 |             href = detail_page_links[i]
35 |             UID = href.split("/")[-1][:-5]
36 |             yield {
37 |                 "UID": UID,
38 |                 "title": title_l[i],
39 |                 "date": date_l[i],
40 |                 "release_date": release_date_l[i],
41 |                 "release_inst": release_inst_l[i],
42 |                 "url": response.urljoin(href),
43 |                 "crawl state": "half",
44 |                 "text length": 0,
45 |                 "FileNumber": None,
46 |             }
47 |         yield from response.follow_all(detail_page_links, callback=self.parse_content)
48 | 
49 |     def parse_content(self, response):
50 |         UID = response.url.split("/")[-1][:-5]
51 |         doc_info_dict = {}
52 |         td_list = response.css("table.gkxl-top td")
53 |         for i in range(len(td_list) // 2):
54 |             key = td_list[2 * i].css("::text").get()
55 |             value = td_list[2 * i + 1].css("::text").get()
56 |             doc_info_dict[key] = value
57 |         FileNum = None
58 |         if "文 号：" in doc_info_dict.keys():
59 |             FileNum = doc_info_dict["文 号："]
60 |         paragraph_list = response.css("div.gkxl-article p *::text").getall()
61 |         if len(paragraph_list) == 0:
62 |             paragraph_list = response.css("p *::text").getall()
63 |         length = len("".join(paragraph_list))
64 |         if length > 0:
65 |             state = "full"
66 |             with open("../../data/text/%s/%s.txt" % (self.name, UID), "w") as f:
67 |                 f.write("\n".join(paragraph_list))
68 |             with open("../../data/HTML_pk/%s/%s.pkl" % (self.name, UID), "wb") as f:
69 |                 pickle.dump(response.text, f)
70 |         else:
71 |             state = "empty"
72 |         return {
73 |             "UID": UID,
74 |             "doc_info_dict": doc_info_dict,
75 |             "mainText": paragraph_list,
76 |             "url": response.url,
77 |             "crawl state": state,
78 |             "text length": length,
79 |             "FileNumber": FileNum,
80 |         }
81 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/FujianSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | import json
 6 | from urllib import parse
 7 | from scrapy.selector import Selector
 8 | 
 9 | class FujianSpider(scrapy.Spider):
10 |     name = "Fujian"
11 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
12 |         os.makedirs('../../data/HTML_pk/%s' % name)
13 |     if not os.path.exists('../../data/text/%s' % name):
14 |         os.makedirs('../../data/text/%s' % name)
15 |     def start_requests(self):
16 |         total_page = 655
17 |         # total_page = 3
18 |         url_base = 'http://www.fujian.gov.cn/was5/web/search?channelid=229105&templet=docs.jsp&sortfield=-pubdate&classsql=chnlid%3E22054*chnlid%3C22084&prepage=10&page={0}'
19 |         for i in range(total_page):
20 |             yield scrapy.Request(url=url_base.format(str(i+1)), callback=self.parse)
21 | 
22 |     def parse(self,response):
23 |         detail_page_links = []
24 |         raw = response.text
25 |         raw = raw.replace('\r','')
26 |         raw = raw.replace('\n','')
27 |         d = json.loads(raw)
28 |         for piece_dict in d['docs']:
29 |             url = piece_dict['url']
30 |             UID = url.split('/')[-1][:-4]
31 |             title = piece_dict['title']
32 |             if '?' not in UID:
33 |                 detail_page_links.append(url)
34 |             if 'fileno' in piece_dict.keys():
35 |                 file_num = piece_dict['fileno']
36 |             else:
37 |                 break
38 |             date = piece_dict['time']
39 |             if date and len(date) > 10:
40 |                 date = date[:10]
41 |             yield {
42 |                 'UID':UID,
43 |                 'title':title,
44 |                 'url':url,
45 |                 'date':date,
46 |                 'FileNumber':file_num,
47 |                 'text length':0,
48 |                 'crawl state': 'half'
49 |             }
50 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
51 | 
52 |     def parse_content(self, response):
53 |         UID = response.url.split('/')[-1][:-4]
54 |         paragraph_list = response.css('div.xl-bk p *::text').getall() 
55 |         if len(paragraph_list) == 0:
56 |             paragraph_list =  response.css('p *::text').getall() 
57 |         length = len(''.join(paragraph_list))
58 |         if length > 0:
59 |             state = 'full'
60 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
61 |                 pickle.dump(response.text,f)
62 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
63 |                 f.write('\n'.join(paragraph_list))
64 |         else:
65 |             state = 'empty'
66 |         return {
67 |             'UID': UID,
68 |             'mainText': paragraph_list,
69 |             'crawl state':state,
70 |             'text length':length,
71 |         }
72 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/GansuSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | class GansuSpider(scrapy.Spider):
 9 |     name = "Gansu"
10 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
11 |         os.makedirs('../../data/HTML_pk/%s' % name)
12 |     if not os.path.exists('../../data/text/%s' % name):
13 |         os.makedirs('../../data/text/%s' % name)
14 |     def start_requests(self):
15 |         headers = {
16 |             'User-Agent': "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0",
17 |             'Referer': "http://www.gansu.gov.cn/col/col4729/index.html",
18 |             "Host": "www.gansu.gov.cn",
19 |             "Origin":"http://www.gansu.gov.cn"
20 |         }
21 |         total_page = 9
22 |         # total_page = 2
23 |         url_base = "http://www.gansu.gov.cn/module/jslib/jquery/jpage/dataproxy.jsp?startrecord={0}&endrecord={1}&perpage=27&appid=1&webid=1&path=%2F&columnid=4729&sourceContentType=3&unitid=18064&webname=%E4%B8%AD%E5%9B%BD%C2%B7%E7%94%98%E8%82%83&permissiontype=0"
24 |         for i in range(total_page):
25 |             yield scrapy.Request(url=url_base.format(i*81+1,(i+1)*81),headers=headers, callback=self.parse)
26 | 
27 |     def parse(self,response):
28 |         detail_page_links = []
29 |         for table_text in ast.literal_eval(response.text[69:-1]):
30 |             table = Selector(text=table_text)
31 |             url = 'http://www.gansu.gov.cn/'+table.css('a::attr(href)').get()
32 |             UID = url.split('/')[-1][:-5]
33 |             detail_page_links.append(url)
34 |             yield {
35 |                 'UID': UID,
36 |                 'title': table.css('a::attr(title)').get(),
37 |                 'date': table.css('span::text').get(),
38 |                 'url': url,
39 |                 'text length':0,
40 |                 'crawl state':'half'
41 |             }
42 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
43 | 
44 |     def parse_content(self, response):
45 |         UID = response.url.split('/')[-1][:-5]
46 |         paragraph_list = response.css('div#zoom p *::text').getall() 
47 |         
48 |         if len(paragraph_list) == 0:
49 |             paragraph_list =  response.css('p *::text').getall() 
50 |         length = len(''.join(paragraph_list))
51 |         if length > 0:
52 |             state = 'full'
53 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
54 |                 pickle.dump(response.text,f)
55 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
56 |                 f.write('\n'.join(paragraph_list))
57 |         else:
58 |             state = 'empty'
59 |         filenum = None
60 |         if '号' in paragraph_list[0]:
61 |             filenum = paragraph_list[0]
62 |         return {
63 |             'UID': UID,
64 |             'mainText': paragraph_list,
65 |             'FileNumber': filenum,
66 |             'crawl state':state,
67 |             'text length':length,
68 |         }
69 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/GuangdongSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | class GuangdongSpider(scrapy.Spider):
 9 |     name = "Guangdong"
10 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
11 |         os.makedirs('../../data/HTML_pk/%s' % name)
12 |     if not os.path.exists('../../data/text/%s' % name):
13 |         os.makedirs('../../data/text/%s' % name)
14 |     def start_requests(self):
15 |         total_page = 205
16 |         # total_page = 3 
17 |         url_base = 'http://www.gd.gov.cn/zwgk/wjk/qbwj/index{0}.html'
18 |         for i in range(total_page):
19 |             page = '_'+ str(i+1) if i > 0 else ''
20 |             yield scrapy.Request(url=url_base.format(page), callback=self.parse)
21 | 
22 |     def parse(self,response):
23 |         detail_page_links = []
24 |         for li in response.css('div.viewList ul li'):
25 |             url = response.urljoin(li.css('a::attr(href)').get())
26 |             UID = url.split('/')[-1][:-5]
27 |             if '?' not in UID:
28 |                 detail_page_links.append(url)
29 |             yield {
30 |                 'UID': UID,
31 |                 'title': li.css('a::text').get(),
32 |                 'date': li.css('span.date::text').get(),
33 |                 'FileNumber':li.css('span.wh::text').get(),
34 |                 'url': url,
35 |                 'text length':0,
36 |                 'crawl state':'half'
37 |             }
38 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
39 | 
40 |     def parse_content(self, response):
41 |         UID = response.url.split('/')[-1][:-5]
42 |         paragraph_list = response.css('div.zw p *::text').getall() 
43 |         attachment = response.css('p a::attr(href)').getall() 
44 |         if len(paragraph_list) == 0:
45 |             paragraph_list =  response.css('p *::text').getall() 
46 |         length = len(''.join(paragraph_list))
47 |         if length > 0:
48 |             state = 'full'
49 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
50 |                 pickle.dump(response.text,f)
51 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
52 |                 f.write('\n'.join(paragraph_list))
53 |         else:
54 |             state = 'empty'
55 |         return {
56 |             'UID': UID,
57 |             'mainText': paragraph_list,
58 |             'attachment_link': attachment,
59 |             'crawl state':state,
60 |             'text length':length,
61 |         }
62 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/GuangxiSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | import json
 8 | 
 9 | 
10 | class GuangxiSpider(scrapy.Spider):
11 |     name = "Guangxi"
12 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
13 |         os.makedirs("../../data/HTML_pk/%s" % name)
14 |     if not os.path.exists("../../data/text/%s" % name):
15 |         os.makedirs("../../data/text/%s" % name)
16 | 
17 |     def start_requests(self):
18 |         total_page = 163
19 |         # total_page = 3
20 |         url_base = "http://www.gxzf.gov.cn/igs/front/search/list.html?index=file2-index-alias&type=governmentdocuments&pageNumber={0}&pageSize=10&filter[AVAILABLE]=true&filter[fileNum-like]=&filter[Effectivestate]=&filter[fileYear]=&filter[fileYear-lte]=&filter[FileName,DOCCONTENT,fileNum-or]=&siteId=14&filter[SITEID]=3&orderProperty=PUBDATE&orderDirection=desc"
21 |         for i in range(total_page):
22 |             yield scrapy.Request(url=url_base.format(i + 1), callback=self.parse)
23 | 
24 |     def parse(self, response):
25 |         detail_page_links = []
26 |         for piece_dict in json.loads(response.text)["page"]["content"]:
27 | 
28 |             piece_dict["UID"] = piece_dict["_id"]
29 |             piece_dict["title"] = piece_dict["DOCTITLE"]
30 |             piece_dict["date"] = piece_dict["PUBDATE"].split("T")[0]
31 |             piece_dict["mainText"] = [piece_dict["DOCCONTENT"]]
32 |             piece_dict["FileNumber"] = piece_dict["CHNLDESC"] + piece_dict["IdxID"]
33 |             piece_dict["url"] = piece_dict["DOCPUBURL"]
34 |             piece_dict["text length"] = len(piece_dict["DOCCONTENT"])
35 |             piece_dict["crawl state"] = "full"
36 | 
37 |             yield piece_dict
38 |         yield from response.follow_all(detail_page_links)
39 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/GuizhouSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | 
 5 | class GuizhouSpider(scrapy.Spider):
 6 |     name = "Guizhou"
 7 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
 8 |         os.makedirs('../../data/HTML_pk/%s' % name)
 9 |     if not os.path.exists('../../data/text/%s' % name):
10 |         os.makedirs('../../data/text/%s' % name)
11 |     def start_requests(self):
12 |         url_dict = {
13 |             'http://www.guizhou.gov.cn/zwgk/zcfg/szfwj_8191/qff_8193/index{0}.html':28,
14 |             'http://www.guizhou.gov.cn/zwgk/zcfg/gfxwj/index{0}.html':17,
15 |         }
16 |         # test_page = 2
17 |         for url_base, max_page in url_dict.items():
18 |             for i in range(max_page):
19 |                 page = '_' + str(i) if i>0 else ''
20 |                 yield scrapy.Request(url=url_base.format(page), callback=self.parse)
21 | 
22 |     def parse(self,response):
23 |         detail_page_links = []
24 | 
25 |         for li in response.css('div.right-list-box ul li'):
26 |             url = li.css('a::attr(href)').get()
27 |             UID = url.split('/')[-1][:-5]
28 |             if '?' not in UID:
29 |                 detail_page_links.append(url)
30 |             title = li.css('a::attr(title)').get()
31 |             file_num = title.split('（')[-1]
32 |             file_num = file_num.split('(')[-1][:-1]
33 |             date = li.css('span::text').get()
34 |             if date and len(date) > 10:
35 |                 date = date[:10]
36 |             yield {
37 |                 'UID': UID,
38 |                 'title': title,
39 |                 'date': date,
40 |                 'FileNumber': file_num,
41 |                 'url': url,
42 |                 'text length':0,
43 |                 'crawl state':'half'
44 |             }
45 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
46 | 
47 |     def parse_content(self, response):
48 |         UID = response.url.split('/')[-1][:-5]
49 |         paragraph_list = response.css('div.view p *::text').getall()  
50 |         if len(paragraph_list) == 0:
51 |             paragraph_list = response.css('div#Zoom p *::text').getall()  
52 |         if len(paragraph_list) == 0:
53 |             paragraph_list =  response.css('p *::text').getall() 
54 |         length = len(''.join(paragraph_list))
55 |         if length > 0:
56 |             state = 'full'
57 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
58 |                 pickle.dump(response.text,f)
59 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
60 |                 f.write('\n'.join(paragraph_list))
61 |         else:
62 |             state = 'empty'
63 |         return {
64 |             'UID': UID,
65 |             'mainText': paragraph_list,
66 |             'crawl state':state,
67 |             'text length':length,
68 |         }
69 | 
70 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/HainanSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import json
 3 | import pickle
 4 | import os
 5 | import ast
 6 | from urllib import parse
 7 | from scrapy.selector import Selector
 8 | 
 9 | 
10 | class HainanSpider(scrapy.Spider):
11 |     name = "Hainan"
12 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
13 |         os.makedirs("../../data/HTML_pk/%s" % name)
14 |     if not os.path.exists("../../data/text/%s" % name):
15 |         os.makedirs("../../data/text/%s" % name)
16 | 
17 |     def start_requests(self):
18 |         total_page = 486
19 |         # total_page = 3
20 |         url_base = "http://www.hainan.gov.cn/u/search/wjk/rs?keywords=&docYear=&docName=&fwzh=&column=undefined&curPage={0}&PageSize=15"
21 |         for i in range(total_page):
22 |             yield scrapy.Request(url=url_base.format(i + 1), callback=self.parse)
23 | 
24 |     def parse(self, response):
25 |         detail_page_links = []
26 |         for item in json.loads(response.text)["page"]["list"]:
27 |             UID = item["url"].split("/")[-1].split(".")[0]
28 |             item["date"] = None
29 |             item["url"] = response.urljoin(item["url"])
30 |             item["UID"] = UID
31 |             date = item["pubDate"]
32 |             if date and len(date) > 10:
33 |                 date = date[:10]
34 |             item["date"] = date
35 |             item["FileNumber"] = item["c_wjbh"]
36 |             if "?" not in UID:
37 |                 detail_page_links.append(item["url"])
38 |             item["crawl state"] = "half"
39 |             item["text length"] = 0
40 |             yield item
41 |         yield from response.follow_all(detail_page_links, callback=self.parse_content)
42 | 
43 |     def parse_content(self, response):
44 |         UID = response.url.split("/")[-1].split(".")[0]
45 |         paragraph_list = response.css("div#zoom p *::text").getall()
46 |         attachment_link = response.css("div#zoom p a::attr(href)").getall()
47 |         if len(paragraph_list) == 0:
48 |             paragraph_list = response.css("table p *::text").getall()
49 |         if len(paragraph_list) == 0:
50 |             paragraph_list = response.css("p *::text").getall()
51 |         length = len("".join(paragraph_list))
52 |         if length > 0:
53 |             with open("../../data/text/%s/%s.txt" % (self.name, UID), "w") as f:
54 |                 f.write("\n".join(paragraph_list))
55 |             with open("../../data/HTML_pk/%s/%s.pkl" % (self.name, UID), "wb") as f:
56 |                 pickle.dump(response.text, f)
57 |             state = "full"
58 |         else:
59 |             state = "empty"
60 |         return {
61 |             "UID": UID,
62 |             "mainText": paragraph_list,
63 |             "attachment_link": attachment_link,
64 |             "crawl state": state,
65 |             "text length": length,
66 |         }
67 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/HebeiSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | 
 5 | class HebeiSpider(scrapy.Spider):
 6 |     name = "Hebei"
 7 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
 8 |         os.makedirs('../../data/HTML_pk/%s' % name)
 9 |     if not os.path.exists('../../data/text/%s' % name):
10 |         os.makedirs('../../data/text/%s' % name)
11 |     def start_requests(self):
12 |         headers = {
13 |             'User-Agent': "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0",
14 |             'Referer': "http://info.hebei.gov.cn/eportal/ui?pageId=6817552"
15 |         }
16 |         total_page = 83
17 |         # total_page = 3
18 |         url_base = 'http://info.hebei.gov.cn/eportal/ui?pageId=6817552&currentPage={0}&moduleId=3bb45f8814654e33ae014e740ccf771b&formKey=GOV_OPEN&columnName=EXT_STR7&relationId='
19 |         for i in range(total_page):
20 |             yield scrapy.Request(url=url_base.format(str(i+1)),headers=headers, callback=self.parse)
21 | 
22 |     def parse(self,response):
23 |         detail_page_links = []
24 |         for piece in response.css('table.xxgkzclbtab3'):
25 |             url = response.urljoin(piece.css('td a::attr(href)').get())
26 |             UID = url.split('=')[-2].split('&')[0]
27 |             if '?' not in UID:
28 |                 detail_page_links.append(url)
29 |             date = piece.css('td[align="center"][width="150"]::text').get()
30 |             if date and len(date) > 3:
31 |                 date = date.replace('年','-').replace('月','-').replace('日','')
32 |             yield {
33 |                 'UID': UID,
34 |                 'title': piece.css('td a::text').get(),
35 |                 'date': date,
36 |                 'url': url,
37 |                 'FileNumber':piece.css('td[align="left"]::text').get(),
38 |                 'text length':0,
39 |                 'crawl state':'half'
40 |             }
41 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
42 | 
43 |     def parse_content(self, response):
44 |         UID = response.url.split('=')[-2].split('&')[0]
45 |         doc_info_dict = {}
46 |         count = 0
47 |         for td in response.css('div.xxgk_bmxl td'):
48 |             if count % 2 == 0:
49 |                 key = td.css('*::text').get()
50 |             else:
51 |                 value = td.css('*::text').get()
52 |                 doc_info_dict[key] = value
53 |             count+=1
54 |         paragraph_list = response.css('div#zoom div *::text').getall()
55 |         attachment_link = response.css('div#zoom div a::attr(href)').getall()
56 |         if len(paragraph_list) == 0:
57 |             paragraph_list =  response.css('div *::text').getall() 
58 |         length = len(''.join(paragraph_list))
59 |         if length > 0:
60 |             state = 'full'
61 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
62 |                 pickle.dump(response.text,f)
63 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
64 |                 f.write('\n'.join(paragraph_list))
65 |         else:
66 |             state = 'empty'
67 |         return {
68 |             'UID': UID,
69 |             'doc_info_dict': doc_info_dict,
70 |             'mainText': paragraph_list,
71 |             'attachment_link': attachment_link,
72 |             'crawl state':state,
73 |             'text length':length,
74 |         }
75 | 
76 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/HeilongjiangSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | import json
 8 | import time
 9 | 
10 | 
11 | class HeilongjiangSpider(scrapy.Spider):
12 |     name = "Heilongjiang"
13 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
14 |         os.makedirs("../../data/HTML_pk/%s" % name)
15 |     if not os.path.exists("../../data/text/%s" % name):
16 |         os.makedirs("../../data/text/%s" % name)
17 | 
18 |     def start_requests(self):
19 |         total_page = 192
20 |         url_base = "http://zwgk.hlj.gov.cn/zwgk/publicInfo/searchFile?chanPath=2,&chanP=2%2C&page={0}&limit=10&total=0"
21 |         for i in range(total_page):
22 |             yield scrapy.Request(url=url_base.format(str(i + 1)), callback=self.parse)
23 | 
24 |     def parse(self, response):
25 |         detail_page_links = []
26 |         detail_url_base = "http://zwgk.hlj.gov.cn/zwgk/publicInfo/detail?id={0}"
27 |         json_dict = json.loads(response.text)
28 |         for piece_dict in json_dict["data"]["records"]:
29 |             UID = piece_dict["id"]
30 |             piece_dict["UID"] = str(UID)
31 |             local_time = time.localtime(piece_dict["publishTime"])
32 |             piece_dict["date"] = time.strftime("%Y-%m-%d", local_time)
33 | 
34 |             detail_page_links.append(detail_url_base.format(UID))
35 |             piece_dict["crawl state"] = "half"
36 |             piece_dict["text length"] = 0
37 |             yield piece_dict
38 |         yield from response.follow_all(detail_page_links, callback=self.parse_content)
39 | 
40 |     def parse_content(self, response):
41 |         UID = response.url.split("=")[-1]
42 |         paragraph_list = response.css("div[class=zwnr] \*::text").getall()
43 |         # new_text = parse.unquote_plus(response.text[7:-6])
44 |         # for escape_text in Selector(text=new_text).css("div.zwnr *::text").getall():
45 |         #     paragraph = (
46 |         #         escape_text.replace("%", "\\").encode("utf-8").decode("unicode_escape")
47 |         #     )
48 |         #     paragraph_list.append(paragraph)
49 |         length = len("".join(paragraph_list))
50 |         if length > 0:
51 |             state = "full"
52 |             with open("../../data/HTML_pk/%s/%s.pkl" % (self.name, UID), "wb") as f:
53 |                 pickle.dump(response.text, f)
54 |             with open("../../data/text/%s/%s.txt" % (self.name, UID), "w") as f:
55 |                 f.write("\n".join(paragraph_list))
56 |         else:
57 |             state = "empty"
58 |         return {
59 |             "UID": UID,
60 |             "mainText": paragraph_list,
61 |             "crawl state": state,
62 |             "text length": length,
63 |         }
64 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/HenanSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | 
 9 | class HenanSpider(scrapy.Spider):
10 |     name = "Henan"
11 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
12 |         os.makedirs("../../data/HTML_pk/%s" % name)
13 |     if not os.path.exists("../../data/text/%s" % name):
14 |         os.makedirs("../../data/text/%s" % name)
15 | 
16 |     def start_requests(self):
17 |         url_dict = {
18 |             "https://www.henan.gov.cn/zwgk/fgwj/szfl/index{0}.html": 5,
19 |             "https://www.henan.gov.cn/zwgk/fgwj/yz/index{0}.html": 46,
20 |             "https://www.henan.gov.cn/zwgk/fgwj/yzb/index{0}.html": 82,
21 |             "https://www.henan.gov.cn/zwgk/fgwj/yzr/index{0}.html": 98,
22 |         }
23 |         # test_page = 2
24 |         for url_base, max_page in url_dict.items():
25 |             for i in range(max_page):
26 |                 page = "_%s" % i if i > 0 else ""
27 |                 yield scrapy.Request(url=url_base.format(page), callback=self.parse)
28 | 
29 |     def parse(self, response):
30 |         detail_page_links = []
31 |         for a in response.css('div[class="con-box"] li'):
32 |             url = response.urljoin(a.css("a::attr(href)").get())
33 |             detail_page_links.append(url)
34 |             UID = url.split("/")[-1].split(".")[0]
35 |             title = a.css("a::text").get()
36 |             file_num = a.css("p::text").get()
37 |             date = a.css("span::text").get()
38 |             yield {
39 |                 "UID": UID,
40 |                 "title": title,
41 |                 "date": date,
42 |                 "FileNumber": file_num,
43 |                 "url": url,
44 |                 "crawl state": "half",
45 |                 "text length": 0,
46 |             }
47 |         yield from response.follow_all(detail_page_links, callback=self.parse_content)
48 | 
49 |     def parse_content(self, response):
50 |         UID = response.url.split("/")[-1].split(".")[0]
51 |         paragraph_list = response.css('div[class="content"] \*::text').getall()
52 |         if len(paragraph_list) == 0:
53 |             paragraph_list = response.css("table p *::text").getall()
54 |         if len(paragraph_list) == 0:
55 |             paragraph_list = response.css("p *::text").getall()
56 |         length = len("".join(paragraph_list))
57 |         if length > 0:
58 |             with open("../../data/text/%s/%s.txt" % (self.name, UID), "w") as f:
59 |                 f.write("\n".join(paragraph_list))
60 |             with open("../../data/HTML_pk/%s/%s.pkl" % (self.name, UID), "wb") as f:
61 |                 pickle.dump(response.text, f)
62 |             state = "full"
63 |         else:
64 |             state = "empty"
65 |         return {
66 |             "UID": UID,
67 |             "mainText": paragraph_list,
68 |             "crawl state": state,
69 |             "text length": length,
70 |         }
71 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/HubeiSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | 
 5 | from selenium import webdriver
 6 | from selenium.webdriver.firefox.options import Options
 7 | options = Options()
 8 | options.headless = True
 9 | 
10 | class HubeiSpider(scrapy.Spider):
11 |     name = "Hubei"
12 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
13 |         os.makedirs('../../data/HTML_pk/%s' % name)
14 |     if not os.path.exists('../../data/text/%s' % name):
15 |         os.makedirs('../../data/text/%s' % name)
16 |     
17 |     def __init__(self):
18 |         self.browser = webdriver.Firefox(options=options)
19 |         self.browser.get('http://www.hubei.gov.cn')
20 |         super().__init__()
21 |     
22 |     def close(self,spider):
23 |         self.browser.quit()
24 | 
25 |     def start_requests(self):
26 |         url_dict = {
27 |             'http://www.hubei.gov.cn/zfwj/szfl/index{0}.shtml':10,
28 |             'http://www.hubei.gov.cn/zfwj/ezf/index{0}.shtml':47,
29 |             'http://www.hubei.gov.cn/zfwj/ezh/index{0}.shtml':12,
30 |             'http://www.hubei.gov.cn/zfwj/ezd/index{0}.shtml':1,
31 |             'http://www.hubei.gov.cn/zfwj/ezbf/index{0}.shtml':50,
32 |             'http://www.hubei.gov.cn/zfwj/qt/index{0}.shtml':8, 
33 |         }
34 |         # test_page = 1
35 |         for url_base, max_page in url_dict.items():
36 |             for i in range(max_page):
37 |                 page = '_' + str(i) if i>0 else ''
38 |                 yield scrapy.Request(url=url_base.format(page), callback=self.parse)
39 | 
40 |     def parse(self,response):
41 |         detail_page_links = []
42 | 
43 |         for li in response.css('div.list_block li'):
44 |             url = response.urljoin(li.css('a::attr(href)').get())
45 |             UID = url.split('/')[-1][:-6]
46 |             date = li.css('span::text').get()
47 |             detail_page_links.append(url)
48 |             if date and len(date)>10:
49 |                 date = date[:10]
50 |             yield {
51 |                 'UID': UID,
52 |                 'title': li.css('a::attr(title)').get(),
53 |                 'date': date,
54 |                 'url': url, 
55 |                 'crawl state':'half',
56 |                 'text length':0,
57 |                 'FileNumber':None
58 |             }
59 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
60 | 
61 |     def parse_content(self, response):
62 |         UID = response.url.split('/')[-1][:-6]
63 |         doc_info_dict = {}
64 |         div_list = response.css('div.metadata_content div.col-xs-12')
65 |         for div in div_list:
66 |             row = div.css('::text').getall()
67 |             if len(row) == 3:
68 |                 key = row[1]
69 |                 value = row[2]
70 |             elif len(row) == 5:
71 |                 key = row[1]
72 |                 value = row[3]
73 |             doc_info_dict[key] = value
74 |         FileNum =None
75 |         if '文    号：' in doc_info_dict.keys():
76 |             FileNum = doc_info_dict['文    号：']
77 |         paragraph_list = response.css('div.content_block *::text').getall()
78 |         if len(paragraph_list) == 0:
79 |             paragraph_list =  response.css('*::text').getall() 
80 |         length = len(''.join(paragraph_list))
81 |         if length > 0:
82 |             state = 'full'
83 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
84 |                 f.write('\n'.join(paragraph_list))
85 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
86 |                 pickle.dump(response.text,f)
87 |         else:
88 |             state = 'empty'
89 |         return {
90 |             'UID': UID,
91 |             'doc_info_dict': doc_info_dict,
92 |             'mainText': paragraph_list,
93 |             'crawl state':state,
94 |             'text length':length,
95 |             'FileNumber':FileNum,
96 |         }
97 | 
98 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/HunanSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | 
 5 | class HunanSpider(scrapy.Spider):
 6 |     name = "Hunan"
 7 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
 8 |         os.makedirs('../../data/HTML_pk/%s' % name)
 9 |     if not os.path.exists('../../data/text/%s' % name):
10 |         os.makedirs('../../data/text/%s' % name)
11 |     def start_requests(self):
12 |         url_dict = {
13 |             'http://www.hunan.gov.cn/hnszf/xxgk/wjk/szfwj/wjk_glrb{0}.html':34,
14 |             'http://www.hunan.gov.cn/hnszf/xxgk/wjk/szfbgt/wjk_glrb{0}.html':34,
15 |         }
16 |         # test_page = 2
17 |         for url_base, max_page in url_dict.items():
18 |             for i in range(max_page):
19 |                 page = '_' + str(i+1) if i>0 else ''
20 |                 yield scrapy.Request(url=url_base.format(page), callback=self.parse)
21 | 
22 |     def parse(self,response):
23 |         detail_page_links = []
24 |         for tr in response.css('tbody tr'):
25 |             url = tr.css('a::attr(href)').get()
26 |             url = response.urljoin(url)
27 |             UID = url.split('/')[-1][:-5]
28 |             if '?' not in UID:
29 |                 detail_page_links.append(url)
30 |             yield {
31 |                 'UID': UID,
32 |                 'title': tr.css('a::text').get(),
33 |                 'date': tr.css('td')[-2].css('::text').get(),
34 |                 'FileNumber': tr.css('td')[-3].css('::text').get().split("'")[1],
35 |                 'url': url,
36 |                 'text length':0,
37 |                 'crawl state':'half'
38 |             }
39 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
40 | 
41 |     def parse_content(self, response):
42 |         UID = response.url.split('/')[-1][:-5]
43 |         paragraph_list = response.css('div#zoom p *::text').getall() 
44 |         attachment_list = response.css('div#zoom a::attr(href)').getall() 
45 |         if len(paragraph_list) == 0:
46 |             paragraph_list =  response.css('p *::text').getall() 
47 |         length = len(''.join(paragraph_list))
48 |         if length > 0:
49 |             state = 'full'
50 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
51 |                 pickle.dump(response.text,f)
52 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
53 |                 f.write('\n'.join(paragraph_list))
54 |         else:
55 |             state = 'empty'
56 |         return {
57 |             'UID': UID,
58 |             'mainText': paragraph_list,
59 |             'attachment_link': attachment_list,
60 |             'crawl state':state,
61 |             'text length':length,
62 |         }
63 | 
64 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/JiangsuSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | class JiangsuSpider(scrapy.Spider):
 9 |     name = "Jiangsu"
10 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
11 |         os.makedirs('../../data/HTML_pk/%s' % name)
12 |     if not os.path.exists('../../data/text/%s' % name):
13 |         os.makedirs('../../data/text/%s' % name)
14 |     def start_requests(self):
15 |         headers = {
16 |             'User-Agent': "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0",
17 |             'Referer': "http://www.jiangsu.gov.cn/col/col76841/index.html?uid=297589&pageNum=3",
18 |             "Host": "www.jiangsu.gov.cn",
19 |             "Origin":"http://www.jiangsu.gov.cn"
20 |         }
21 | 
22 |         total_page = 63
23 |         # total_page = 8
24 |         # url_base = "http://www.jiangsu.gov.cn/module/web/jpage/dataproxy.jsp?col=1&appid=1&webid=1&path=%2F&columnid=76841&sourceContentType=1&unitid=297589&webname=%E6%B1%9F%E8%8B%8F%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&permissiontype=0"
25 |         # url_base = 'http://www.jiangsu.gov.cn/col/col76841/index.html?uid=297589&pageNum={0}&col=1&appid=1&webid=1&path=%2F&columnid=76841&sourceContentType=1&unitid=297589&webname=%E6%B1%9F%E8%8B%8F%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&permissiontype=0'
26 |         url_base = 'http://www.jiangsu.gov.cn/col/col76841/index.html?uid=297589&pageNum={0}&col=1&appid=1&webid=1&path=%2F&columnid=76841&sourceContentType=1&unitid=297589&webname=%C3%83%C2%A6%C3%82%C2%B1%C3%82%C2%9F%C3%83%C2%A8%C3%82%C2%8B%C3%82%C2%8F%C3%83%C2%A7%C3%82%C2%9C%C3%82%C2%81%C3%83%C2%A4%C3%82%C2%BA%C3%82%C2%BA%C3%83%C2%A6%C3%82%C2%B0%C3%82%C2%91%C3%83%C2%A6%C3%82%C2%94%C3%82%C2%BF%C3%83%C2%A5%C3%82%C2%BA%C3%82%C2%9C&permissiontype=0'
27 |         url_base = 'http://www.jiangsu.gov.cn/module/web/jpage/dataproxy.jsp?startrecord={0}&endrecord={0}&perpage=25&col=1&appid=1&webid=1&path=%2F&columnid=76841&sourceContentType=1&unitid=297589&webname=%E6%B1%9F%E8%8B%8F%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&permissiontype=0'
28 |         for i in range(total_page):
29 |             yield scrapy.Request(url=url_base.format(i*75+1,i*75+75),headers=headers, callback=self.parse)
30 | 
31 |     def parse(self,response):
32 |         detail_page_links = []
33 |         for html_text in response.css('record *::text').getall():
34 |             record = Selector(text = html_text)
35 |             url = record.css('a::attr(href)').get()
36 |             UID = url.split('/')[-1][:-5]+'_'+url.split('/')[-4]+url.split('/')[-3]+url.split('/')[-2]
37 |             detail_page_links.append(url)
38 |             yield {
39 |                 'UID': UID,
40 |                 'title': record.css('a::attr(title)').get(),
41 |                 'date': record.css('b::text').get(),
42 |                 'FileNumber':None,
43 |                 'text length':0,
44 |                 'url': url,
45 |                 'crawl state':'half'
46 |             }
47 |         for url in detail_page_links:
48 |             yield scrapy.Request(url=url, callback = self.parse_content)
49 | 
50 |     def parse_content(self, response):
51 |         url = response.url
52 |         UID = url.split('/')[-1][:-5]+'_'+url.split('/')[-4]+url.split('/')[-3]+url.split('/')[-2]
53 |         doc_info_dict = {}
54 |         count = 0
55 |         for td in response.css('tbody td'):
56 |             if count % 2 == 0:
57 |                 key = td.css("::text").get()
58 |             else:
59 |                 value = td.css("::text").get()
60 |                 doc_info_dict[key] = value
61 |             count+=1
62 |         file_num = doc_info_dict['文\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0号'] if '文\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0号' in doc_info_dict.keys() else  None
63 |         paragraph_list = response.css('div#zoom p *::text').getall() 
64 |         if len(paragraph_list) == 0:
65 |             paragraph_list =  response.css('p *::text').getall() 
66 |         length = len(''.join(paragraph_list))
67 |         if length > 0:
68 |             state = 'full'
69 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
70 |                 pickle.dump(response.text,f)
71 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
72 |                 f.write('\n'.join(paragraph_list))
73 |         else:
74 |             state = 'empty'
75 |         return {
76 |             'UID': UID,
77 |             'mainText': paragraph_list,
78 |             'FileNumber': file_num,
79 |             'crawl state':state,
80 |             'text length':length,
81 |         }
82 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/JiangxiSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | 
 5 | class JiangxiSpider(scrapy.Spider):
 6 |     name = "Jiangxi"
 7 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
 8 |         os.makedirs('../../data/HTML_pk/%s' % name)
 9 |     if not os.path.exists('../../data/text/%s' % name):
10 |         os.makedirs('../../data/text/%s' % name)
11 |     def start_requests(self):
12 |         total_page = 108
13 |         url_base = 'http://www.jiangxi.gov.cn/module/xxgk/subjectinfo.jsp?sortfield=compaltedate:0&fbtime=&texttype=0&vc_all=&vc_filenumber=&vc_title=&vc_number=&currpage={0}&binlay=&c_issuetime='
14 |         for i in range(total_page):
15 |             yield scrapy.Request(url=url_base.format(str(i+1)), callback=self.parse)
16 | 
17 |     def parse(self,response):
18 |         detail_page_links = []
19 |         for piece in response.css('tr.tr_main_value_odd'):
20 |             href = piece.css('td a::attr(href)').get()
21 |             UID = href.split('/')[-1]
22 |             UID = UID.split('?')[0][:-5]
23 |             detail_page_links.append(href)
24 |             yield {
25 |                 'UID': UID,
26 |                 'docID': piece.css('td a::attr(syh)').get(),
27 |                 'title': piece.css('td a::attr(mc)').get(),
28 |                 'date': piece.css('td a::attr(rq)').get(),
29 |                 'FileNumber':None,
30 |                 'url': response.urljoin(href),
31 |                 'crawl state':'half'
32 |             }
33 |         for piece in response.css('tr.tr_main_value_even'):
34 |             href = piece.css('td a::attr(href)').get()
35 |             UID = href.split('/')[-1]
36 |             UID = UID.split('?')[0][:-5]
37 |             if '?' not in UID:
38 |                 detail_page_links.append(href)
39 |             yield {
40 |                 'UID': UID,
41 |                 'docID': piece.css('td a::attr(syh)').get(),
42 |                 'title': piece.css('td a::attr(mc)').get(),
43 |                 'date': piece.css('td a::attr(rq)').get(),
44 |                 'FileNumber':None,
45 |                 'url': response.urljoin(href),
46 |                 'text length':0,
47 |                 'crawl state':'half'
48 |             } 
49 |         
50 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
51 | 
52 |     def parse_content(self, response):
53 |         UID = response.url.split('/')[-1]
54 |         UID = UID.split('?')[0][:-5]
55 |         values = response.css('div.bt-article-y')[0].css('tr td::text').getall()
56 |         keys = response.css('div.bt-article-y')[0].css('tr td b span::text').getall() +\
57 |            response.css('div.bt-article-y')[0].css('tr td b::text').getall()
58 |         doc_info_dict = {}
59 |         if len(keys) == len(values):
60 |             for i in range(len(keys)):
61 |                 doc_info_dict[keys[i]] = values[i]
62 |         
63 |         full_tittle = ''.join(response.css('div.bt-article-y p.sp_title::text').getall())
64 |         paragraph_list = response.css('div.bt-article-y div#zoom p::text').getall()
65 |         if len(paragraph_list) == 0:
66 |             paragraph_list =  response.css('p *::text').getall() 
67 |         FileNum = None
68 |         if '文\xa0\xa0\xa0\xa0\xa0\xa0号:' in doc_info_dict.keys():
69 |             FileNum = doc_info_dict['文\xa0\xa0\xa0\xa0\xa0\xa0号:']
70 |         attachment_link = response.css('div.bt-article-y div#zoom p a::attr(href)').getall()
71 |         attachment_link = [link for link in attachment_link if link[:16]=='/module/download']
72 |         length = len(''.join(paragraph_list))
73 |         if length > 0:
74 |             state = 'full'
75 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
76 |                 pickle.dump(response.text,f)
77 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
78 |                 f.write('\n'.join(paragraph_list))
79 |         else:
80 |             state = 'empty'
81 |         return {
82 |             'UID': UID,
83 |             'full_tittle': full_tittle,
84 |             'FileNumber':FileNum,
85 |             'doc_info_dict': doc_info_dict,
86 |             'mainText': paragraph_list,
87 |             'attachment_link': attachment_link,
88 |             'crawl state':state,
89 |             'text length':length,
90 |         }
91 | 
92 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/JilinSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import json
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | class JilinSpider(scrapy.Spider):
 9 |     name = "Jilin"
10 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
11 |         os.makedirs('../../data/HTML_pk/%s' % name)
12 |     if not os.path.exists('../../data/text/%s' % name):
13 |         os.makedirs('../../data/text/%s' % name)
14 |     def start_requests(self):
15 |         total_page = 173
16 |         # total_page = 3
17 |         url_base = 'http://infogate.jl.gov.cn/govsearch/jsonp/zf_jd_list.jsp?page={0}&lb=134657&callback=result&sword=&searchColumn=all&searchYear=all&pubURL=http%3A%2F%2Fxxgk.jl.gov.cn%2F&SType=1&searchColumnYear=all&searchYear=all&pubURL=&SType=1&channelId=134657&_=1585041673815'
18 |         for i in range(total_page):
19 |             yield scrapy.Request(url=url_base.format(str(i+1)), callback=self.parse)
20 | 
21 |     def parse(self,response):
22 |         detail_page_links = []
23 |         for piece_dict in json.loads(response.text[66:-4])['data']:
24 |             item = {
25 |                 'UID':piece_dict['MetaDataId'],
26 |                 'url':piece_dict['puburl'],
27 |                 'title':piece_dict['tip']['title'],
28 |                 'date':piece_dict['tip']['dates'],
29 |                 'FileNumber':piece_dict['tip']['filenum'],
30 |                 'publisher':piece_dict['tip']['publisher'],
31 |                 'text length':0,
32 |                 'crawl state':'half',
33 |             }
34 |             if '?' not in item['UID']:
35 |                 detail_page_links.append(item['url'])
36 |             yield item
37 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
38 | 
39 |     def parse_content(self, response):
40 |         UID = response.url.split('_')[-1][:-5]
41 |         state = 'full' if response.status == 200 else 'half'
42 |         paragraph_list = response.css('div.zlyxwz_t2a p *::text').getall()
43 |         attachment_links = response.css('div.zlyxwz_t2a p a::attr(href)').getall()        
44 |         
45 |         if len(paragraph_list) == 0:
46 |             paragraph_list =  response.css('p *::text').getall() 
47 |         if len(paragraph_list) == 0:
48 |             paragraph_list =  response.css('*::text').getall() 
49 |         length = len(''.join(paragraph_list))
50 |         if length > 0:
51 |             state = 'full'
52 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
53 |                 pickle.dump(response.text,f)
54 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
55 |                 f.write('\n'.join(paragraph_list))
56 |         else:
57 |             state = 'empty'
58 |         return {
59 |             'UID': UID,
60 |             'mainText': paragraph_list,
61 |             'attachment_links':attachment_links,
62 |             'crawl state':state,
63 |             'text length':length,
64 |         }
65 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/LiaoningSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | 
 5 | class LiaoningSpider(scrapy.Spider):
 6 |     name = "Liaoning"
 7 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
 8 |         os.makedirs('../../data/HTML_pk/%s' % name)
 9 |     if not os.path.exists('../../data/text/%s' % name):
10 |         os.makedirs('../../data/text/%s' % name)
11 |     def start_requests(self):
12 |         url_dict = {
13 |             'http://www.ln.gov.cn/zfxx/zfwj/szfl/index{0}.html':9,
14 |             'http://www.ln.gov.cn/zfxx/zfwj/szfbgtwj/zfwj2011_136268/index{0}.html':2,
15 |             'http://www.ln.gov.cn/zfxx/zfwj/szfwj/zfwj2011_140407/index{0}.html':1,
16 |             'http://www.ln.gov.cn/zfxx/zfwj/bmwj/index{0}.html':1
17 |         }
18 |         #test_page = 1
19 |         for url_base, max_page in url_dict.items():
20 |             for i in range(max_page):
21 |                 page = '_' + str(i) if i>0 else ''
22 |                 yield scrapy.Request(url=url_base.format(page), callback=self.parse)
23 | 
24 |     def parse(self,response):
25 |         url_sign = response.url.split('/')[-2]
26 |         if url_sign == 'zfwj2011_136268' or url_sign == 'zfwj2011_140407':
27 |             td_base = 0
28 |         elif url_sign == 'szfl':
29 |             td_base = 1
30 |         elif url_sign == 'bmwj':
31 |             td_base = 2
32 |         detail_page_links = []
33 |         for tr in response.css('table.dataList tr')[1:]:
34 |             td_list = tr.css('td')
35 |             href = response.urljoin(td_list[0+td_base].css('a::attr(href)').get())
36 |             UID = href.split('/')[-1][:-5]
37 |             if '?' not in UID:
38 |                 detail_page_links.append(href)
39 |             date = td_list[2+td_base].css('::text').get() 
40 |             if date and len(date) > 3:
41 |                 date = date.replace('年','-').replace('月','-').replace('日','')
42 |             yield {
43 |                 'UID': UID,
44 |                 'title': td_list[0+td_base].css('a::attr(title)').get(),
45 |                 'date': date,
46 |                 'FileNumber':td_list[1+td_base].css('::text').get(),
47 |                 'url': href,
48 |                 'text length':0,
49 |                 'crawl state':'half'
50 |             }
51 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
52 | 
53 |     def parse_content(self, response):
54 |         UID = response.url.split('/')[-1][:-5]
55 |         paragraph_list = response.css('div#main *::text').getall()
56 |         
57 |         if len(paragraph_list) == 0:
58 |             paragraph_list =  response.css('div *::text').getall() 
59 |         length = len(''.join(paragraph_list))
60 |         if length > 0:
61 |             state = 'full'
62 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
63 |                 pickle.dump(response.text,f)
64 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
65 |                 f.write('\n'.join(paragraph_list))
66 |         else:
67 |             state = 'empty'
68 |         return {
69 |             'UID': UID,
70 |             'mainText': paragraph_list,
71 |             'crawl state':state,
72 |             'text length':length,
73 |         }
74 | 
75 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/NeimengguSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | 
 5 | 
 6 | class NeimengguSpider(scrapy.Spider):
 7 |     name = "Neimenggu"
 8 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
 9 |         os.makedirs("../../data/HTML_pk/%s" % name)
10 |     if not os.path.exists("../../data/text/%s" % name):
11 |         os.makedirs("../../data/text/%s" % name)
12 | 
13 |     def start_requests(self):
14 |         headers = {
15 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0",
16 |             "Referer": "http://www.nmg.gov.cn/col/col1686/index.html",
17 |             "Host": "www.nmg.gov.cn",
18 |             "Origin": "http://www.nmg.gov.cn",
19 |         }
20 |         url_dict = {
21 |             "http://www.nmg.gov.cn/zwgk/zfxxgk/zfxxgkml/gzxzgfxwj/xzgfxwj/index{0}.html": 71,
22 |             "http://www.nmg.gov.cn/zwgk/zfxxgk/zfxxgkml/zzqzfjbgtwj/index{0}.html": 152,
23 |         }
24 |         for url_base, max_page in url_dict.items():
25 |             for i in range(max_page):
26 |                 page = "_" + str(i) if i > 0 else ""
27 |                 yield scrapy.Request(url=url_base.format(page), callback=self.parse)
28 | 
29 |     def parse(self, response):
30 |         detail_page_links = []
31 | 
32 |         for piece in response.css("tbody tr"):
33 |             url = piece.css("td")[1].css("a::attr(href)").get()
34 |             UID = url.split("/")[-1].split(".")[0]
35 | 
36 |             detail_page_links.append(url)
37 |             yield {
38 |                 "UID": UID,
39 |                 "title": piece.css("td")[1].css("a::text").get(),
40 |                 "date": piece.css("td")[-1].css("::text").get(),
41 |                 "FileNumber": piece.css("td")[-3].css("::text").get(),
42 |                 "text length": 0,
43 |                 "url": url,
44 |                 "crawl state": "half",
45 |             }
46 |         yield from response.follow_all(detail_page_links, callback=self.parse_content)
47 | 
48 |     def parse_content(self, response):
49 |         UID = response.url.split("/")[-1].split(".")[0]
50 |         paragraph_list = response.css(
51 |             'div[class="view TRS_UEDITOR trs_paper_default trs_external"] \*::text'
52 |         ).getall()
53 | 
54 |         if len(paragraph_list) == 0:
55 |             paragraph_list = response.css("p *::text").getall()
56 |         length = len("".join(paragraph_list))
57 |         if length > 0:
58 |             state = "full"
59 |             with open("../../data/HTML_pk/%s/%s.pkl" % (self.name, UID), "wb") as f:
60 |                 pickle.dump(response.text, f)
61 |             with open("../../data/text/%s/%s.txt" % (self.name, UID), "w") as f:
62 |                 f.write("\n".join(paragraph_list))
63 |         else:
64 |             state = "empty"
65 |         return {
66 |             "UID": UID,
67 |             "mainText": paragraph_list,
68 |             "crawl state": state,
69 |             "text length": length,
70 |         }
71 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/NingxiaSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | class NingxiaSpider(scrapy.Spider):
 9 |     name = "Ningxia"
10 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
11 |         os.makedirs('../../data/HTML_pk/%s' % name)
12 |     if not os.path.exists('../../data/text/%s' % name):
13 |         os.makedirs('../../data/text/%s' % name)
14 |     def start_requests(self):
15 |         total_page = 48 
16 |         # total_page = 3 
17 |         headers = {
18 |             'User-Agent': "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0",
19 |             'Referer': "http://www.nx.gov.cn/zwgk/qzfwj/",
20 |             "Host": "www.nx.gov.cn",
21 |             "Origin":"http://www.nx.gov.cn"
22 |         }
23 |         url_base = 'http://www.nx.gov.cn/zwgk/qzfwj/list{0}.html'
24 |         for i in range(total_page):
25 |             page = '_'+ str(i) if i > 0 else ''
26 |             yield scrapy.Request(url=url_base.format(page),headers=headers, callback=self.parse)
27 | 
28 |     def parse(self,response):
29 |         detail_page_links = []
30 |         for li in response.css('ul.nx-list li'):
31 |             url = response.urljoin(li.css('a::attr(href)').get())
32 |             UID = url.split('/')[-1][:-5]
33 |             if '?' not in UID:
34 |                 detail_page_links.append(url)
35 |             FileNumber = None
36 |             doc_info_dict = {}
37 |             for p in li.css('div.nx-conmtab p'):
38 |                 key = p.css('span.tt::text').get()
39 |                 value = p.css('span.value::text').get()
40 |                 doc_info_dict[key] = value
41 |                 if key == '发文字号：':
42 |                     FileNumber = value
43 |             yield {
44 |                 'UID': UID,
45 |                 'title': li.css('a::attr(title)').get(),
46 |                 'date': li.css('span.date::text').get(),
47 |                 'FileNumber': FileNumber,
48 |                 'doc_info_dict':doc_info_dict,
49 |                 'text length':0,
50 |                 'url': url,
51 |                 'crawl state':'half'
52 |             }
53 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
54 | 
55 |     def parse_content(self, response):
56 |         UID = response.url.split('/')[-1][:-5]
57 |         paragraph_list = response.css('div.view p *::text').getall()
58 |         if len(paragraph_list) == 0:
59 |             paragraph_list =  response.css('p *::text').getall() 
60 |         length = len(''.join(paragraph_list))
61 |         if length > 0:
62 |             state = 'full'
63 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
64 |                 pickle.dump(response.text,f)
65 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
66 |                 f.write('\n'.join(paragraph_list))
67 |         else:
68 |             state = 'empty'
69 |         return {
70 |             'UID': UID,
71 |             'mainText': paragraph_list,
72 |             'crawl state':state,
73 |             'text length':length,
74 |         }
75 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/QinghaiSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | 
 9 | class QinghaiSpider(scrapy.Spider):
10 |     name = "Qinghai"
11 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
12 |         os.makedirs("../../data/HTML_pk/%s" % name)
13 |     if not os.path.exists("../../data/text/%s" % name):
14 |         os.makedirs("../../data/text/%s" % name)
15 | 
16 |     def start_requests(self):
17 |         total_page = 59
18 |         # total_page = 3
19 |         url_base = "http://zwgk.qh.gov.cn/xxgk/fd/zfwj/index{0}.html"
20 |         for i in range(total_page):
21 |             page = "_" + str(i) if i > 0 else ""
22 |             yield scrapy.Request(url=url_base.format(page), callback=self.parse)
23 | 
24 |     def parse(self, response):
25 |         detail_page_links = []
26 |         for li in response.css('table[class="zctb"] tr')[1:]:
27 |             url = li.css("td")[0].css("a::attr(href)").get()
28 |             UID = url.split("/")[-1].split(".")[0]
29 |             detail_page_links.append(url)
30 |             date = UID.split("_")[0][1:]
31 |             date = "-".join([date[:4], date[4:6], date[6:8]])
32 |             yield {
33 |                 "UID": UID,
34 |                 "title": "".join(li.css("td")[0].css("a \*::text").getall()),
35 |                 "date": date,
36 |                 "FileNumber": li.css("td")[1].css("::text").get(),
37 |                 "text length": 0,
38 |                 "url": url,
39 |                 "crawl state": "half",
40 |             }
41 |         yield from response.follow_all(detail_page_links, callback=self.parse_content)
42 | 
43 |     def parse_content(self, response):
44 |         UID = response.url.split("/")[-1].split(".")[0]
45 | 
46 |         paragraph_list = response.css("div#contentlf \*::text").getall()
47 |         if len(paragraph_list) == 0:
48 |             paragraph_list = response.css("p *::text").getall()
49 |         length = len("".join(paragraph_list))
50 |         if length > 0:
51 |             state = "full"
52 |             with open("../../data/HTML_pk/%s/%s.pkl" % (self.name, UID), "wb") as f:
53 |                 pickle.dump(response.text, f)
54 |             with open("../../data/text/%s/%s.txt" % (self.name, UID), "w") as f:
55 |                 f.write("\n".join(paragraph_list))
56 |         else:
57 |             state = "empty"
58 |         return {
59 |             "UID": UID,
60 |             "mainText": paragraph_list,
61 |             "crawl state": state,
62 |             "text length": length,
63 |         }
64 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/Shaanxi_shanSpider.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | import os
 3 | import scrapy
 4 | 
 5 | 
 6 | class Shaanxi_shanSpider(scrapy.Spider):
 7 |     name = "Shaanxi_shan"
 8 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
 9 |         os.makedirs("../../data/HTML_pk/%s" % name)
10 |     if not os.path.exists("../../data/text/%s" % name):
11 |         os.makedirs("../../data/text/%s" % name)
12 | 
13 |     def start_requests(self):
14 |         url_dict = {
15 |             "http://www.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/szfgz/index{0}.html": 11,
16 |             "http://www.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index{0}.html": 5,
17 |             "http://www.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/szfwj/szf/index{0}.html": 70,
18 |             "http://www.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/szfwj/szz/index{0}.html": 17,
19 |             "http://www.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/szfwj/szrz/index{0}.html": 134,
20 |             "http://www.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/szfwj/sztb/index{0}.html": 46,
21 |             "http://www.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/szfwj/szh/index{0}.html": 56,
22 |             "http://www.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/szfbgtwj/szbf/index{0}.html": 134,
23 |             "http://www.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/szfbgtwj/szbz/index{0}.html": 6,
24 |             "http://www.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/szfbgtwj/szbfmd/index{0}.html": 14,
25 |         }
26 |         # test_page = 2
27 |         for url_base, max_page in url_dict.items():
28 |             for i in range(max_page):
29 |                 page = "_%s" % i if i > 0 else ""
30 |                 yield scrapy.Request(url=url_base.format(page), callback=self.parse)
31 | 
32 |     def parse(self, response):
33 |         detail_page_links = []
34 |         for tr in response.css('ul[class="gov-item cf-otw"] li'):
35 |             href = tr.css('div[class="a-news otw lf"] a::attr(href)').get()
36 |             url = response.urljoin(href)
37 |             UID = url.split("/")[-1].split(".")[0]
38 | 
39 |             detail_page_links.append(url)
40 |             yield {
41 |                 "UID": UID,
42 |                 "title": tr.css('div[class="a-news otw lf"] a::attr(title)').get(),
43 |                 "date": tr.css('span[class="date rt"]::text').get().strip(),
44 |                 "FileNumber": tr.css('span[class="code-num otw lf"]::text')
45 |                 .get()
46 |                 .strip(),
47 |                 "text length": 0,
48 |                 "url": url,
49 |                 "crawl state": "half",
50 |             }
51 |         yield from response.follow_all(detail_page_links, callback=self.parse_content)
52 | 
53 |     def parse_content(self, response):
54 |         UID = response.url.split("/")[-1].split(".")[0]
55 | 
56 |         paragraph_list = response.css("div#doc_left *::text").getall()
57 |         if len(paragraph_list) == 0:
58 |             paragraph_list = response.css("p *::text").getall()
59 |         if len(paragraph_list) == 0:
60 |             paragraph_list = response.css("*::text").getall()
61 | 
62 |         length = len("".join(paragraph_list))
63 |         if length > 0:
64 |             state = "full"
65 |             with open("../../data/HTML_pk/%s/%s.pkl" % (self.name, UID), "wb") as f:
66 |                 pickle.dump(response.text, f)
67 |             with open("../../data/text/%s/%s.txt" % (self.name, UID), "w") as f:
68 |                 f.write("\n".join(paragraph_list))
69 |         else:
70 |             state = "empty"
71 |         return {
72 |             "UID": UID,
73 |             "mainText": paragraph_list,
74 |             "crawl state": state,
75 |             "text length": length,
76 |         }
77 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/ShandongSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | 
 9 | class ShandongSpider(scrapy.Spider):
10 |     name = "Shandong"
11 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
12 |         os.makedirs("../../data/HTML_pk/%s" % name)
13 |     if not os.path.exists("../../data/text/%s" % name):
14 |         os.makedirs("../../data/text/%s" % name)
15 | 
16 |     def start_requests(self):
17 |         total_page = 6735
18 |         # total_page = 3
19 |         url_base = "http://www.shandong.gov.cn/module/xxgk/search_custom.jsp?fields=&fieldConfigId=247732&sortfield=compaltedate:0&fbtime=&texttype=&vc_all=&vc_filenumber=&vc_title=&vc_number=&currpage={0}&binlay=&c_issuetime="
20 | 
21 |         for i in range(total_page):
22 |             yield scrapy.Request(url=url_base.format(i + 1), callback=self.parse)
23 | 
24 |     def parse(self, response):
25 |         detail_page_links = []
26 |         for div in response.css("div.wip_lists"):
27 |             url = div.css("a::attr(href)").get()
28 |             UID = url.split("/")[-1][:-16]
29 |             if "?" not in UID:
30 |                 detail_page_links.append(url)
31 |             date = div.css('div[class="wip_lists_time bt-left"]::text').get().strip()
32 | 
33 |             yield {
34 |                 "UID": UID,
35 |                 "title": div.css("a::text").get(),
36 |                 "date": date,
37 |                 "FileNumber": None,
38 |                 "text length": 0,
39 |                 "url": url,
40 |                 "crawl state": "half",
41 |             }
42 |         yield from response.follow_all(detail_page_links, callback=self.parse_content)
43 | 
44 |     def parse_content(self, response):
45 |         UID = response.url.split("/")[-1][:-16]
46 | 
47 |         paragraph_list = response.css("div.wip_art_con p *::text").getall()
48 |         attachment_link = response.css("div.wip_art_con p a::attr(href)").getall()
49 |         if len(paragraph_list) == 0:
50 |             paragraph_list = response.css("div#zoom p *::text").getall()
51 |             attachment_link = response.css("div#zoom p a::attr(href)").getall()
52 |         if len(paragraph_list) == 0:
53 |             paragraph_list = response.css("p *::text").getall()
54 |             attachment_link = []
55 |         if len(response.css("div.wip_art_con p")) >= 2:
56 |             File_num = response.css("div.wip_art_con p")[1].css("::text").get()
57 |         else:
58 |             File_num = None
59 |         if File_num and "号" not in File_num:
60 |             File_num = None
61 |         length = len("".join(paragraph_list))
62 |         if length > 0:
63 |             state = "full"
64 |             with open("../../data/HTML_pk/%s/%s.pkl" % (self.name, UID), "wb") as f:
65 |                 pickle.dump(response.text, f)
66 |             with open("../../data/text/%s/%s.txt" % (self.name, UID), "w") as f:
67 |                 f.write("\n".join(paragraph_list))
68 |         else:
69 |             state = "empty"
70 |         return {
71 |             "UID": UID,
72 |             "FileNumber": File_num,
73 |             "mainText": paragraph_list,
74 |             "attachment_link": attachment_link,
75 |             "crawl state": state,
76 |             "text length": length,
77 |         }
78 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/Shandong_leftSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | import pandas as pd
 7 | from scrapy.selector import Selector
 8 | 
 9 | class Shandong_leftSpider(scrapy.Spider):
10 |     name = "Shandong_left"
11 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
12 |         os.makedirs('../../data/HTML_pk/%s' % name)
13 |     if not os.path.exists('../../data/text/%s' % name):
14 |         os.makedirs('../../data/text/%s' % name)
15 |     def start_requests(self):
16 |         yield scrapy.Request('http://www.shandong.gov.cn',callback=self.parse)
17 | 
18 |     def parse(self,response):
19 |         detail_page_links = []
20 |         df = pd.read_csv('../../data/empty/Shandong_empty_list.csv')     
21 |         for i in range(len(df)):
22 |             UID = str(df.loc[i,'UID'])
23 |             if '?' not in UID:
24 |                 title = df.loc[i,'title']
25 |                 date = df.loc[i,'date']
26 |                 url = df.loc[i,'url']
27 |                 detail_page_links.append(url)
28 |                 yield {
29 |                         'UID': UID,
30 |                         'title': title,
31 |                         'date': date,
32 |                         'FileNumber':None,
33 |                         'text length':0,
34 |                         'url': url,
35 |                         'crawl state':'half'
36 |                     }
37 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
38 | 
39 |     def parse_content(self, response):
40 |         UID = response.url.split('/')[-1][:-16]
41 | 
42 |         paragraph_list = response.css('div.wip_art_con p *::text').getall()
43 |         if len(paragraph_list) == 0:
44 |             paragraph_list = response.css('div#zoom p *::text').getall()
45 |         if len(paragraph_list) == 0:
46 |             paragraph_list = response.css('p *::text').getall()
47 |         if len(paragraph_list) == 0:
48 |             paragraph_list = response.css('*::text').getall()
49 |         if len(response.css('div.wip_art_con p')) >= 2:
50 |             File_num = response.css('div.wip_art_con p')[1].css('::text').get()
51 |         else:
52 |             File_num = None
53 |         if File_num and '号' not in File_num:
54 |             File_num = None
55 |         length = len(''.join(paragraph_list))
56 |         if length > 0:
57 |             state = 'full'
58 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
59 |                 pickle.dump(response.text,f)
60 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
61 |                 f.write('\n'.join(paragraph_list))
62 |         else:
63 |             state = 'empty'
64 |         return {
65 |             'UID': UID,
66 |             'FileNumber':File_num,
67 |             'mainText': paragraph_list,
68 |             'crawl state':state,
69 |             'text length':length,
70 |         }
71 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/ShanghaiSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | 
 5 | 
 6 | class ShanghaiSpider(scrapy.Spider):
 7 |     name = "Shanghai"
 8 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
 9 |         os.makedirs("../../data/HTML_pk/%s" % name)
10 |     if not os.path.exists("../../data/text/%s" % name):
11 |         os.makedirs("../../data/text/%s" % name)
12 | 
13 |     def start_requests(self):
14 |         total_page = 298
15 |         url_base = "http://service.shanghai.gov.cn/XingZhengWenDangKuJyh/XZGFList.aspx?testpara=0&kw=&issueDate_userprop8=&status=1&departid=0&wenhao=&issueDate_userprop8_end=&excuteDate=&excuteDate_end=&closeDate=&closeDate_end=&departtypename=0&typename=%E5%85%A8%E9%83%A8&zhutitypename=&zhuti=&currentPage={0}&pagesize=10"
16 |         for i in range(total_page):
17 |             yield scrapy.Request(url=url_base.format(str(i + 1)), callback=self.parse)
18 | 
19 |     def parse(self, response):
20 |         detail_page_links = []
21 |         for piece in response.css('table[class="table table-list"] tr')[1:]:
22 |             url = response.urljoin(piece.css("td a::attr(href)").get())
23 |             UID = url.split("=")[-1]
24 | 
25 |             detail_page_links.append(url)
26 | 
27 |             date = piece.css("td")[2].css("::text").get()
28 | 
29 |             yield {
30 |                 "UID": UID,
31 |                 "title": piece.css("td a::attr(title)").get(),
32 |                 "date": date,
33 |                 "url": url,
34 |                 "text length": 0,
35 |                 "crawl state": "half",
36 |                 "FileNumber": piece.css("td")[1].css("::text").get(),
37 |             }
38 |         yield from response.follow_all(detail_page_links, callback=self.parse_content)
39 | 
40 |     def parse_content(self, response):
41 |         UID = response.url.split("=")[-1]
42 | 
43 |         paragraph_list = response.css("div#ivs_content *::text").getall()
44 | 
45 |         if len(paragraph_list) == 0:
46 |             paragraph_list = response.css("p *::text").getall()
47 | 
48 |         if len(paragraph_list) > 0 and "号" in paragraph_list[0]:
49 |             Filenum = paragraph_list[0]
50 |         length = len("".join(paragraph_list))
51 |         if length > 0:
52 |             state = "full"
53 |             with open("../../data/HTML_pk/%s/%s.pkl" % (self.name, UID), "wb") as f:
54 |                 pickle.dump(response.text, f)
55 |             with open("../../data/text/%s/%s.txt" % (self.name, UID), "w") as f:
56 |                 f.write("\n".join(paragraph_list))
57 |         else:
58 |             state = "empty"
59 |         return {
60 |             "UID": UID,
61 |             "mainText": paragraph_list,
62 |             "crawl state": state,
63 |             "text length": length,
64 |         }
65 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/Shanxi_jinSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | 
 5 | class Shanxi_jinSpider(scrapy.Spider):
 6 |     name = "Shanxi_jin"
 7 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
 8 |         os.makedirs('../../data/HTML_pk/%s' % name)
 9 |     if not os.path.exists('../../data/text/%s' % name):
10 |         os.makedirs('../../data/text/%s' % name)
11 |     def start_requests(self):
12 |         total_page = 138
13 |         # total_page = 3
14 |         
15 |         headers = {
16 |             'User-Agent': "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0",
17 |             'Referer': "http://www.shanxi.gov.cn/sxszfxxgk/index.shtml",
18 |         }
19 |         url_base = "http://www.shanxi.gov.cn/sxszfxxgk/index{0}.shtml"
20 |         for i in range(total_page):
21 |             page = '_%s' % (i+1) if i >0 else ''
22 |             yield scrapy.Request(url=url_base.format(page), headers =headers,callback=self.parse)
23 | 
24 |     def parse(self,response):
25 |         detail_page_links = []
26 |         hrefs = response.css('table.affairs-document-box tr td.affaires-doc-title a::attr(href)').getall()
27 |         filenums = response.css('table.affairs-document-box tr td.affaires-doc-sizes::text').getall()
28 |         dates = response.css('table.affairs-document-box tr td.affaires-doc-published::text').getall()
29 |         titles = response.css('table.affairs-document-box tr td.affaires-doc-title a.doc-title::attr(title)').getall()
30 |         for i in range(len(hrefs)):
31 |             url = hrefs[i]
32 |             url = response.urljoin(url)
33 |             UID = url.split('/')[-1][:-6]
34 |             if '?' not in UID:
35 |                 detail_page_links.append(url)
36 |             date = dates[i]
37 |             if date and len(date) > 3:
38 |                 date = date.replace('年','-').replace('月','-').replace('日','')
39 |             yield {
40 |                 'UID': UID,
41 |                 'title': titles[i],
42 |                 'date': date,
43 |                 'FileNumber': filenums[i],
44 |                 'text length':0,
45 |                 'url': url,
46 |                 'crawl state':'half'
47 |             }
48 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
49 | 
50 |     def parse_content(self, response):
51 |         UID = response.url.split('/')[-1][:-6]
52 |         doc_info_dict = {}
53 |         count = 0
54 |         for td in response.css('table.affairs-detail-head td'):
55 |             if count % 2 == 0:
56 |                 key = td.css('*::text').get()
57 |             else:
58 |                 value = td.css('*::text').get()
59 |                 doc_info_dict[key] = value
60 |             count+=1
61 |         paragraph_list = response.css('div[style="FONT-SIZE: 16px; LINE-HEIGHT: 160%"] *::text').getall()
62 |         if len(paragraph_list) == 0:
63 |             paragraph_list =  response.css('div *::text').getall() 
64 |         if len(paragraph_list) == 0:
65 |             paragraph_list =  response.css('*::text').getall() 
66 |         pdf_links = [response.urljoin(response.css('div.article-body a::attr(href)').getall()[-1])]
67 |         length = len(''.join(paragraph_list))
68 |         if length > 0:
69 |             state = 'full'
70 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
71 |                 pickle.dump(response.text,f)
72 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
73 |                 f.write('\n'.join(paragraph_list))
74 |         else:
75 |             state = 'empty'
76 |         return {
77 |             'UID': UID,
78 |             'doc_info_dict': doc_info_dict,
79 |             'mainText': paragraph_list,
80 |             'attachment_links':pdf_links,
81 |             'crawl state':state,
82 |             'text length':length,
83 |         }
84 | 
85 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/SichuanSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | 
 9 | class SichuanSpider(scrapy.Spider):
10 |     name = "Sichuan"
11 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
12 |         os.makedirs("../../data/HTML_pk/%s" % name)
13 |     if not os.path.exists("../../data/text/%s" % name):
14 |         os.makedirs("../../data/text/%s" % name)
15 | 
16 |     def start_requests(self):
17 |         url_dict = {
18 |             "http://www.sc.gov.cn/10462/c102914/list_ft{0}.shtml": 7,
19 |             "http://www.sc.gov.cn/10462/c103043/stt_list{0}.shtml": 4,
20 |             "http://www.sc.gov.cn/10462/c103044/stt_list{0}.shtml": 14,
21 |             "http://www.sc.gov.cn/10462/c103045/stt_list{0}.shtml": 15,
22 |             "http://www.sc.gov.cn/10462/c103046/stt_list{0}.shtml": 15,
23 |             "http://www.sc.gov.cn/10462/c103047/stt_list{0}.shtml": 15,
24 |             "http://www.sc.gov.cn/10462/c103048/stt_list{0}.shtml": 2,
25 |         }
26 |         # test_page = 1
27 |         for url_base, max_page in url_dict.items():
28 |             for i in range(max_page):
29 |                 page = "_" + str(i + 1) if i > 0 else ""
30 |                 yield scrapy.Request(url=url_base.format(page), callback=self.parse)
31 | 
32 |     def parse(self, response):
33 |         detail_page_links = []
34 |         for tr in response.css("div#content table#dash-table tr"):
35 |             url = response.urljoin(tr.css("td")[1].css("a::attr(href)").get())
36 |             UID = url.split("/")[-1].split(".")[0]
37 |             if "?" not in UID:
38 |                 detail_page_links.append(url)
39 | 
40 |             yield {
41 |                 "UID": UID,
42 |                 "title": tr.css("td")[1].css("a::attr(title)").get(),
43 |                 "date": None,
44 |                 "FileNumber": None,
45 |                 "text length": 0,
46 |                 "url": url,
47 |                 "crawl state": "half",
48 |             }
49 |         yield from response.follow_all(detail_page_links, callback=self.parse_content)
50 | 
51 |     def parse_content(self, response):
52 |         UID = response.url.split("/")[-1].split(".")[0]
53 |         paragraph_list = response.css("td[class=contText] \*::text").getall()
54 |         attachment_link = []
55 |         filenumber = (
56 |             response.css('td[bgcolor="#fbfbfb"] tr')[1]
57 |             .css("td")[3]
58 |             .css("\*::text")
59 |             .get()
60 |             .strip()
61 |         )
62 |         date = (
63 |             response.css('td[bgcolor="#fbfbfb"] tr')[1]
64 |             .css("td")[1]
65 |             .css("\*::text")
66 |             .get()
67 |             .strip()
68 |         )
69 | 
70 |         if len(paragraph_list) == 0:
71 |             paragraph_list = response.css("p *::text").getall()
72 |         length = len("".join(paragraph_list))
73 |         if length > 0:
74 |             state = "full"
75 |             with open("../../data/HTML_pk/%s/%s.pkl" % (self.name, UID), "wb") as f:
76 |                 pickle.dump(response.text, f)
77 |             with open("../../data/text/%s/%s.txt" % (self.name, UID), "w") as f:
78 |                 f.write("\n".join(paragraph_list))
79 |         else:
80 |             state = "empty"
81 |         return {
82 |             "UID": UID,
83 |             "mainText": paragraph_list,
84 |             "attachment_link": attachment_link,
85 |             "crawl state": state,
86 |             "FileNumber": filenumber,
87 |             "date": date,
88 |             "text length": length,
89 |         }
90 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/TianjinSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import json
 5 | from selenium import webdriver
 6 | from selenium.webdriver.firefox.options import Options
 7 | 
 8 | options = Options()
 9 | options.headless = True
10 | 
11 | 
12 | class TianjinSpider(scrapy.Spider):
13 |     name = "Tianjin"
14 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
15 |         os.makedirs("../../data/HTML_pk/%s" % name)
16 |     if not os.path.exists("../../data/text/%s" % name):
17 |         os.makedirs("../../data/text/%s" % name)
18 | 
19 |     def __init__(self):
20 |         self.browser = webdriver.Firefox(options=options)
21 |         self.browser.get("http://gk.tj.gov.cn/")
22 |         super().__init__()
23 | 
24 |     def close(self, spider):
25 |         self.browser.quit()
26 | 
27 |     def start_requests(self):
28 |         total_page = 1000
29 |         url_base = "http://www.tj.gov.cn/igs/front/search/list.html?index=zcwj-index-200424&type=zcfg&filter[AVAILABLE]=true&siteId=&pageSize=10&pageNumber={0}&filter%5BFWJG_name%5D=&filter%5BBT%2CZW%2CFWZH-or%5D=&orderProperty=FBRQ&orderDirection=desc&filter%5BWZFL_name%5D=&filter%5BFBRQ-lte%5D=&filter%5BFBRQ-gte%5D="
30 |         for i in range(total_page):
31 |             req = scrapy.Request(url=url_base.format(str(i + 1)), callback=self.parse)
32 |             req.meta["dont_redirect"] = True
33 |             req.meta["handle_httpstatus_list"] = [302]
34 |             yield req
35 | 
36 |     def parse(self, response):
37 |         for piece in json.loads(response.text)["page"]["content"]:
38 |             piece["url"] = piece["DOCPUBURL"]
39 |             piece["UID"] = piece["url"].split("/")[-1].split(".")[0]
40 |             piece["title"] = piece["BT"]
41 |             piece["date"] = piece["FBRQ"].split("T")[0]
42 |             piece["mainText"] = piece["ZW"]
43 |             piece["crawl state"] = "full"
44 |             piece["text length"] = 0
45 |             piece["FileNumber"] = piece["FWZH"]
46 |             yield piece
47 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/XinjiangSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | class XinjiangSpider(scrapy.Spider):
 9 |     name = "Xinjiang"
10 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
11 |         os.makedirs('../../data/HTML_pk/%s' % name)
12 |     if not os.path.exists('../../data/text/%s' % name):
13 |         os.makedirs('../../data/text/%s' % name)
14 |     def start_requests(self):
15 |         total_page = 34
16 |         # total_page = 3 
17 |         url_base = 'http://www.xinjiang.gov.cn/xinjiang/gfxwj/zfxxgk_gknrz{0}.shtml'
18 |         for i in range(total_page):
19 |             page = '_'+ str(i+1) if i > 0 else ''
20 |             yield scrapy.Request(url=url_base.format(page), callback=self.parse)
21 | 
22 |     def parse(self,response):
23 |         detail_page_links = []
24 |         for dd in response.css('div.gknr_list dd'):
25 |             url = response.urljoin(dd.css('a::attr(href)').get())
26 |             UID = url.split('/')[-1][:-6]
27 |             if '?' not in UID:
28 |                 detail_page_links.append(url)
29 |             yield {
30 |                 'UID': UID,
31 |                 'title': dd.css('a::attr(title)').get(),
32 |                 'date': dd.css('span::text').get(),
33 |                 'FileNumber':None,
34 |                 'text length':0,
35 |                 'url': url,
36 |                 'crawl state':'half'
37 |             }
38 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
39 | 
40 |     def parse_content(self, response):
41 |         UID = response.url.split('/')[-1][:-6]
42 |         doc_info_dict = {}
43 |         for li in response.css('ul.clearfix li'):
44 |             tmp_l = li.css('*::text').getall()
45 |             if len(tmp_l) == 2:
46 |                 doc_info_dict[tmp_l[0]] = tmp_l[1]
47 |             else:
48 |                 tmp_l = tmp_l[0].split('：')
49 |                 if len(tmp_l) == 2:
50 |                     doc_info_dict[tmp_l[0]] = tmp_l[1]
51 |         File_num = None
52 |         if '发文字号' in doc_info_dict.keys():
53 |             File_num = doc_info_dict['发文字号']
54 |         paragraph_list = response.css('div.gknbxq_detail p *::text').getall()        
55 |         attachment_link = response.css('div.ewebeditor_doc img::attr(src)').getall()        
56 |         if len(paragraph_list) == 0:
57 |             paragraph_list =  response.css('p *::text').getall() 
58 |         length = len(''.join(paragraph_list))
59 |         if length > 0:
60 |             state = 'full'
61 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
62 |                 pickle.dump(response.text,f)
63 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
64 |                 f.write('\n'.join(paragraph_list))
65 |         else:
66 |             state = 'empty'
67 |         return {
68 |             'UID': UID,
69 |             'FileNumber':File_num,
70 |             'mainText': paragraph_list,
71 |             'attachment_link': attachment_link,
72 |             'doc_info_dict':doc_info_dict,
73 |             'crawl state':state,
74 |             'text length':length,
75 |         }
76 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/XizangSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | class XizangSpider(scrapy.Spider):
 9 |     name = "Xizang"
10 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
11 |         os.makedirs('../../data/HTML_pk/%s' % name)
12 |     if not os.path.exists('../../data/text/%s' % name):
13 |         os.makedirs('../../data/text/%s' % name)
14 |     def start_requests(self):
15 |         # total_page = 3
16 |         total_page = 54
17 |         url_base = 'http://www.xizang.gov.cn/zwgk/xxfb/gsgg_428/index{0}.html'
18 |         for i in range(total_page):
19 |             page = '_'+ str(i+1) if i > 0 else ''
20 |             yield scrapy.Request(url=url_base.format(page), callback=self.parse)
21 | 
22 |     def parse(self,response):
23 |         detail_page_links = []
24 |         for li in response.css('ul.zwyw_list li'):
25 |             url = response.urljoin(li.css('a::attr(href)').get())
26 |             UID = url.split('/')[-1][:-5]
27 |             if '?' not in UID:
28 |                 detail_page_links.append(url)
29 |             yield {
30 |                 'UID': UID,
31 |                 'title': li.css('a::text').get(),
32 |                 'date': li.css('span::text').get(),
33 |                 'FileNumber':None,
34 |                 'url': url,
35 |                 'text length':0,
36 |                 'crawl state':'half'
37 |             }
38 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
39 | 
40 |     def parse_content(self, response):
41 |         UID = response.url.split('/')[-1][:-5]
42 |         doc_info_dict = {}
43 |         th_list = response.css('table.table tr td.th')
44 |         td_list = response.css('table.table tr td.td')
45 |         for i in range(len(th_list)):
46 |             key = th_list[i].css('::text').get()
47 |             value = td_list[i].css('::text').get()
48 |             doc_info_dict[key] = value
49 |         File_num = None
50 |         if '文 \xa0\xa0\xa0\xa0 号' in doc_info_dict.keys():
51 |             File_num = doc_info_dict['文 \xa0\xa0\xa0\xa0 号']
52 |         paragraph_list = response.css('div.view *::text').getall()
53 |         if len(paragraph_list) == 0:
54 |             paragraph_list = response.css('div *::text').getall()
55 |         length = len(''.join(paragraph_list))
56 |         if length > 0:
57 |             state = 'full'
58 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
59 |                 pickle.dump(response.text,f)
60 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
61 |                 f.write('\n'.join(paragraph_list))
62 |         else:
63 |             state = 'empty'
64 |         return {
65 |             'UID': UID,
66 |             'FileNumber':File_num,
67 |             'mainText': paragraph_list,
68 |             'doc_info_dict':doc_info_dict,
69 |             'crawl state':state,
70 |             'text length':length,
71 |         }
72 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/YunnanSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | 
 9 | class YunnanSpider(scrapy.Spider):
10 |     name = "Yunnan"
11 |     if not os.path.exists("../../data/HTML_pk/%s" % name):
12 |         os.makedirs("../../data/HTML_pk/%s" % name)
13 |     if not os.path.exists("../../data/text/%s" % name):
14 |         os.makedirs("../../data/text/%s" % name)
15 | 
16 |     def start_requests(self):
17 |         url_dict = {
18 |             "http://www.yn.gov.cn/zwgk/zcwj/szfl/index{0}.html": 5,
19 |             "http://www.yn.gov.cn/zwgk/zcwj/yzf/index{0}.html": 42,
20 |             "http://www.yn.gov.cn/zwgk/zcwj/yzg/index{0}.html": 1,
21 |             "http://www.yn.gov.cn/zwgk/zcwj/yzh/index{0}.html": 7,
22 |             "http://www.yn.gov.cn/zwgk/zcwj/yunzf/index{0}.html": 12,
23 |             "http://www.yn.gov.cn/zwgk/zcwj/yzr/index{0}.html": 42,
24 |             "http://www.yn.gov.cn/zwgk/zcwj/yfmd/index{0}.html": 1,
25 |             "http://www.yn.gov.cn/zwgk/zcwj/yzfb/index{0}.html": 42,
26 |             "http://www.yn.gov.cn/zwgk/zcwj/yzbg/index{0}.html": 2,
27 |             "http://www.yn.gov.cn/zwgk/zcwj/yzbh/index{0}.html": 28,
28 |             "http://www.yn.gov.cn/zwgk/zcwj/yzbmd/index{0}.html": 5,
29 |             "http://www.yn.gov.cn/zwgk/zcwj/qtwj/index{0}.html": 5,
30 |         }
31 |         for url_base, max_page in url_dict.items():
32 |             for i in range(max_page):
33 |                 page = "_" + str(i) if i > 0 else ""
34 |                 yield scrapy.Request(url=url_base.format(page), callback=self.parse)
35 | 
36 |     def parse(self, response):
37 |         detail_page_links = []
38 |         for ul in response.css("tbody tr")[1:]:
39 |             url = response.urljoin(ul.css("td")[1].css("a::attr(href)").get())
40 |             UID = url.split("/")[-1].split(".")[0]
41 | 
42 |             detail_page_links.append(url)
43 |             date = ul.css("td")[2].css("::text").get()
44 | 
45 |             yield {
46 |                 "UID": UID,
47 |                 "title": ul.css("td")[1].css("a::text").get(),
48 |                 "date": date,
49 |                 "FileNumber": ul.css("td")[0].css("::text").get(),
50 |                 "url": url,
51 |                 "text length": 0,
52 |                 "crawl state": "half",
53 |             }
54 |         yield from response.follow_all(detail_page_links, callback=self.parse_content)
55 | 
56 |     def parse_content(self, response):
57 |         UID = response.url.split("/")[-1].split(".")[0]
58 |         paragraph_list = response.css('div[class="arti"] \*::text').getall()
59 |         if len(paragraph_list) == 0:
60 |             paragraph_list = response.css("p *::text").getall()
61 |         length = len("".join(paragraph_list))
62 |         if length > 0:
63 |             state = "full"
64 |             with open("../../data/HTML_pk/%s/%s.pkl" % (self.name, UID), "wb") as f:
65 |                 pickle.dump(response.text, f)
66 |             with open("../../data/text/%s/%s.txt" % (self.name, UID), "w") as f:
67 |                 f.write("\n".join(paragraph_list))
68 |         else:
69 |             state = "empty"
70 |         return {
71 |             "UID": UID,
72 |             "mainText": paragraph_list,
73 |             "crawl state": state,
74 |             "text length": length,
75 |         }
76 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/ZhejiangSpider.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import pickle
 3 | import os
 4 | import ast
 5 | from urllib import parse
 6 | from scrapy.selector import Selector
 7 | 
 8 | class ZhejiangSpider(scrapy.Spider):
 9 |     name = "Zhejiang"
10 |     if not os.path.exists('../../data/HTML_pk/%s' % name):
11 |         os.makedirs('../../data/HTML_pk/%s' % name)
12 |     if not os.path.exists('../../data/text/%s' % name):
13 |         os.makedirs('../../data/text/%s' % name)
14 |     def start_requests(self):
15 |         total_page = 509
16 |         # total_page = 3 
17 |         url_base = 'http://www.zj.gov.cn/module/xxgk/search.jsp?infotypeId=&jdid=3096&area=000014349&divid=div1551294&vc_title=&vc_number=&sortfield=,compaltedate:0&currpage={0}&vc_filenumber=&vc_all=&texttype=0&fbtime=&texttype=0&fbtime=&vc_all=&vc_filenumber=&vc_title=&vc_number=&currpage=3&sortfield=,compaltedate:0'
18 |         for i in range(total_page):
19 |             yield scrapy.Request(url=url_base.format(i+1), callback=self.parse)
20 | 
21 |     def parse(self,response):
22 |         detail_page_links = []
23 |         for tr in response.css('tr')[4:-2]:
24 |             url = response.urljoin(tr.css('a::attr(href)').get())
25 |             UID = url.split('/')[-1][:-5]
26 |             if '?' not in UID:
27 |                 detail_page_links.append(url)
28 |             yield {
29 |                 'UID': UID,
30 |                 'title': tr.css('a::attr(mc)').get(),
31 |                 'date': tr.css('a::attr(rq)').get(),
32 |                 'FileNumber':tr.css('a::attr(wh)').get(),
33 |                 'url': url,
34 |                 'crawl state':'half',
35 |                 'text length':0,
36 |             }
37 |         yield from response.follow_all(detail_page_links, callback = self.parse_content)
38 | 
39 |     def parse_content(self, response):
40 |         UID = response.url.split('/')[-1][:-5]
41 |         paragraph_list = response.css('div.bt_content p *::text').getall()          
42 |         if len(paragraph_list) == 0:
43 |             paragraph_list =response.css('div#zoom p *::text').getall()           
44 |         if len(paragraph_list) == 0:
45 |             paragraph_list =  response.css('p *::text').getall() 
46 |         if len(paragraph_list) == 0:
47 |             paragraph_list =  response.css('tbody *::text').getall() 
48 |         length = len(''.join(paragraph_list))
49 |         if length > 0:
50 |             with open('../../data/HTML_pk/%s/%s.pkl' % (self.name,UID), 'wb') as f:
51 |                 pickle.dump(response.text,f)
52 |             with open('../../data/text/%s/%s.txt' % (self.name,UID), 'w') as f:
53 |                 f.write('\n'.join(paragraph_list))
54 |             return {
55 |                 'UID': UID,
56 |                 'mainText': paragraph_list,
57 |                 'crawl state':'full',
58 |                 'text length':length,
59 |             }
60 |         else:
61 |             return {
62 |                 'UID': UID,
63 |                 'mainText': paragraph_list,
64 |                 'crawl state':'empty',
65 |                 'text length':0,
66 |             }
67 | 


--------------------------------------------------------------------------------
/src/crawl_data/crawl_data/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/src/crawl_data/crwal_jsst.sh:
--------------------------------------------------------------------------------
1 | scrapy crawl Jilin
2 | scrapy crawl Shaanxi_shan
3 | scrapy crawl Shanxi_jin
4 | scrapy crawl Tianjin
5 | 


--------------------------------------------------------------------------------
/src/crawl_data/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = crawl_data.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = crawl_data
12 | 


--------------------------------------------------------------------------------
/src/crawl_data/test_get_ip.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | 
 4 | url = "https://api.xiaoxiangdaili.com/ip/get?appKey=571491556088238080&appSecret=6VZhoE4G&cnt=1&method=http&releaseAuto=false&wt=json"
 5 | 
 6 | resp = requests.get(url)
 7 | print(resp.status_code,type(resp.status_code))
 8 | if resp.status_code == 200:
 9 |     x = json.loads(resp.text)
10 |     s = 'http://%s:%s' %(x['data'][0]['ip'],x['data'][0]['port'])
11 |     print(s)
12 |     
13 | 


--------------------------------------------------------------------------------
/src/crawl_data/test_proxy.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | #请求地址
 4 | targetUrl = "http://baidu.com"
 5 | 
 6 | #代理服务器
 7 | proxyHost = "125.122.199.13"
 8 | proxyPort = "9000"
 9 | 
10 | proxyMeta = "http://%(host)s:%(port)s" % {
11 | 
12 |     "host" : proxyHost,
13 |     "port" : proxyPort,
14 | }
15 | 
16 | proxyMeta = "http://114.106.72.38:3000"
17 | 
18 | #pip install -U requests[socks]  socks5代理
19 | # proxyMeta = "socks5://%(host)s:%(port)s" % {
20 | 
21 | #     "host" : proxyHost,
22 | 
23 | #     "port" : proxyPort,
24 | 
25 | # }
26 | 
27 | proxies = {
28 | 
29 |     "http"  : proxyMeta,
30 | }
31 | 
32 | resp = requests.get(targetUrl, proxies=proxies)
33 | print(resp.status_code)
34 | print(resp.text)
35 | 


--------------------------------------------------------------------------------
/src/crawl_test/sichuan.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SmartDataLab/Policy_crawler/fb9fcb7ab701dfb98606afe9f7260f2f2e857506/src/crawl_test/sichuan.py


--------------------------------------------------------------------------------
/src/data_analysis/preprocess.py:
--------------------------------------------------------------------------------
 1 | #%%
 2 | """
 3 | To csv format for submission
 4 | """
 5 | import pymongo
 6 | import pandas as pd
 7 | 
 8 | MONGO_URI = 'mongodb://localhost:27017'
 9 | MONGO_DB = 'Policy'
10 | COLLECTION = 'sample_task'
11 | 
12 | client = pymongo.MongoClient(MONGO_URI)
13 | db  = client[MONGO_DB]
14 | table = db[COLLECTION]
15 | data_list = list(table.find())
16 | client.close()
17 | df = pd.DataFrame(data_list)
18 | 
19 | df.to_csv('../../data/raw_data.csv')
20 | 
21 | # %%
22 | """
23 | Data clean & Partition
24 | """
25 | import pickle
26 | import re
27 | import jieba
28 | from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
29 | doc_list = []
30 | for piece in data_list:
31 |     words_per_doc = []
32 |     for paragraph in piece['mainText']:
33 |         sub_paragraph = re.sub(\
34 |             "[0-9\s+\.\!\/_,$%^*()?;；:-【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+",\
35 |             ' ',paragraph)
36 |         words_per_doc += jieba.lcut(sub_paragraph)
37 |     doc_list.append(' '.join(words_per_doc))
38 | 
39 | #%%
40 | """
41 | Prepare data to plot wordcloud
42 | """
43 | CV = CountVectorizer(token_pattern='\\b\\w+\\b')
44 | CV.fit_transform(doc_list)
45 | word_count = CV.vocabulary_
46 | 
47 | with open('../../data/word_cloud.pk','wb') as f:
48 |     pickle.dump(word_count,f)
49 | 
50 | # %%
51 | import numpy
52 | cv_count = CountVectorizer(token_pattern='\\b\\w+\\b').fit_transform(doc_list)
53 | tf_transformer = TfidfTransformer(use_idf=True)
54 | tf_idf = tf_transformer.fit_transform(cv_count)
55 | print(tf_idf.shape)
56 | 
57 | with open('../../data/tf_idf.pk','wb') as f:
58 |     pickle.dump(tf_idf,f)
59 | 
60 | # %%
61 | with open('../../data/doc_list.pk','wb') as f:
62 |     pickle.dump(doc_list,f)
63 | 
64 | 
65 | # %%
66 | 


--------------------------------------------------------------------------------
/src/data_analysis/text_cluster.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | with open('../../data/tfidf.pk','rb') as f:
3 |     tfidf = pickle.load(f)
4 | from sklearn.cluster import KMeans  
5 | clf = KMeans(n_clusters=20)  
6 | s = clf.fit(tfidf)


--------------------------------------------------------------------------------
/src/data_analysis/topic.py:
--------------------------------------------------------------------------------
 1 | from gensim import corpora, models
 2 | import pickle
 3 | with open('../../data/doc_list.pk','rb') as f:
 4 |     doc_list = pickle.load(f)
 5 | 
 6 | dictionary = corpora.Dictionary(words_ls)
 7 | 
 8 | corpus = [dictionary.doc2bow(words) for words in words_ls]
 9 | 
10 | lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
11 | 
12 | for topic in lda.print_topics(num_topics=4):
13 |     print(topic)


--------------------------------------------------------------------------------
/src/data_analysis/wordcloud_diamond.py:
--------------------------------------------------------------------------------
 1 | from pyecharts import options as opts
 2 | from pyecharts.charts import WordCloud
 3 | from pyecharts.globals import SymbolType
 4 | import pickle
 5 | 
 6 | with open('../../data/word_cloud.pk','rb') as f:
 7 |     words = pickle.load(f)
 8 | 
 9 | # words = [
10 | #     ("Sam S Club", 10000),
11 | #     ("Macys", 6181),
12 | #     ("Amy Schumer", 4386),
13 | #     ("Jurassic World", 4055),
14 | #     ("Charter Communications", 2467),
15 | #     ("Chick Fil A", 2244),
16 | #     ("Planet Fitness", 1868),
17 | #     ("Pitch Perfect", 1484),
18 | #     ("Express", 1112),
19 | #     ("Home", 865),
20 | #     ("Johnny Depp", 847),s
21 | #     ("Lena Dunham", 582),
22 | #     ("Lewis Hamilton", 555),
23 | #     ("KXAN", 550),
24 | #     ("Mary Ellen Mark", 462),
25 | #     ("Farrah Abraham", 366),
26 | #     ("Rita Ora", 360),
27 | #     ("Serena Williams", 282),
28 | #     ("NCAA baseball tournament", 273),
29 | #     ("Point Break", 265),
30 | # ]
31 | 
32 | 
33 | c = (
34 |     WordCloud()
35 |     .add("", words[:30], word_size_range=[20, 100], shape=SymbolType.DIAMOND)
36 |     .set_global_opts(title_opts=opts.TitleOpts(title="WordCloud-shape-diamond"))
37 |     .render(path="../../figure/word_cloud.png")
38 | )
39 | 


--------------------------------------------------------------------------------
/src/data_analysis/wordcloud_plot.py:
--------------------------------------------------------------------------------
 1 | #%%
 2 | from wordcloud import WordCloud
 3 | import matplotlib.pyplot as plt
 4 | import pickle
 5 | font = '/usr/share/fonts/华文楷体.ttf'
 6 | 
 7 | with open('../../data/word_cloud.pk','rb') as f:
 8 |     words = pickle.load(f)
 9 | 
10 | wc=WordCloud(background_color='White',width=800,height=600,font_path=font,scale=64)
11 | wc.generate_from_frequencies(words)
12 | 
13 | plt.imshow(wc)
14 | plt.axis("off")
15 | plt.savefig('../../figure/wordcloud.png')
16 | plt.show()
17 | 
18 | 
19 | 
20 | 
21 | # %%
22 | 


--------------------------------------------------------------------------------
/src/doc_tex/en.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[10pt, conference, compsocconf]{IEEEtran}
  2 | \hyphenation{op-tical net-works semi-conduc-tor}
  3 | \usepackage{amsmath}
  4 | \usepackage{algorithm}
  5 | \usepackage{algorithmic}
  6 | \usepackage{color}
  7 | \usepackage{stfloats}
  8 | \usepackage{supertabular}
  9 | \usepackage{booktabs}
 10 | \usepackage{graphicx}
 11 | \usepackage{bm}
 12 | \usepackage[colorlinks,linkcolor=red,anchorcolor=blue,citecolor=green,CJKbookmarks=True]{hyperref}
 13 | \begin{document}
 14 | \title{Constructing Financial Sentimental Factors in Chinese Market Using Natural Language Processing}
 15 | \author
 16 | {\IEEEauthorblockN{Junfeng Jiang \IEEEauthorrefmark{1}\IEEEauthorrefmark{2},Jiahao Li\IEEEauthorrefmark{1}\IEEEauthorrefmark{2}}
 17 | \IEEEauthorblockA
 18 | {
 19 | \IEEEauthorrefmark{1}Likelihood Technology\\
 20 | }
 21 | \IEEEauthorblockA
 22 | {
 23 | \IEEEauthorrefmark{2}Sun Yat-sen University\\
 24 | }
 25 | $ $\\
 26 | $\{Jiangjf6,lijh76\}@mail2.sysu.edu.cn$
 27 | }
 28 | 
 29 | \maketitle
 30 | \begin{abstract}
 31 | In this paper, we design an integrated algorithm to evaluate the sentiment of Chinese market. Firstly, with the help of the web browser automation, we crawl a lot of news and comments from several influential financial websites automatically. Secondly, we use techniques of Natural Language Processing(NLP) under Chinese context, including tokenization, Word2vec word embedding and semantic database WordNet, to compute Senti-scores of these news and comments, and then construct the sentimental factor. Here, we build a finance-specific sentimental lexicon so that the sentimental factor can reflect the sentiment of financial market but not the general sentiments as happiness, sadness, etc. Thirdly, we also implement an adjustment of the standard sentimental factor. Our experimental performance shows that there is a significant correlation between our standard sentimental factor and the Chinese market, and the adjusted factor is even more informative, having a stronger correlation with the Chinese market. Therefore, our sentimental factors can be important references when making investment decisions. Especially during the Chinese market crash in 2015, the Pearson correlation coefficient of adjusted sentimental factor with SSE is 0.5844, which suggests that our model can provide a solid guidance, especially in the special period when the market is influenced greatly by public sentiment.
 32 | \end{abstract}
 33 | 
 34 | \begin{IEEEkeywords}
 35 | Natural Language Processing; Word2Vec; WordNet; Sentiment Analysis;
 36 | \end{IEEEkeywords}
 37 | 
 38 | \IEEEpeerreviewmaketitle
 39 | \section{Introduction}
 40 | 
 41 | Natural language processing, as one of the most promising fields in machine learning, has achieved great development recently and has been used in lots of aspects in the society. Many researches also implemented the technique of NLP in financial market. The difficulty when applying NLP is that the natural language is not a kind of structural data. Finding a way to process such kind of non-structural data is the main focus of NLP. A lot of models have been demonstrated to do well in turning natural language data into numerical data, which is more tractable. With the implementation of these models, it becomes possible and easier to make use of natural language data.
 42 | 
 43 | Some models are based on the idea of Naive Bayesian[\hyperref[ref 1]{1}]. The logic behind these models is: words that show the same kind of sentiment will appear simultaneously more frequently. These models usually select some words as the label words. By analyzing the words appeared in a large amount of texts and researching on the relationship between the frequency of these label words and the frequency of other words, it becomes possible to cluster words. For any given texts, it is able to use these words to evaluate the sentiment behind. Researches have proved this kind of methods can successfully evaluate the sentiment of texts like twitter or news. By taking advantage of the sentiment, investors can make appropriate investment decisions.
 44 | 
 45 | However, this kind of methods have their own limitations. The main is that they just focus on a few words. Some new words that show the similar sentiment but do not appear frequently will be ignored. Sometimes unfortunately, these words do play a significant role when analyzing the sentiment of a text. The lost of information can have great damage to the accuracy of the evaluation.
 46 | 
 47 | 
 48 | This study is aiming to put eyes as many words as possible to analyze sentiment. The specific steps of this study are as followed:
 49 | 
 50 | \begin{itemize}
 51 | \item Download news from several influential financial websites automatically.
 52 | \item Make a pre-treatment on the news we crawl from the Internet.
 53 | \item Find a way or some algorithms to analyse this per-treated text data, and finally compute the sentimental factor of each day with the news at that day.
 54 | \item Choose a proper criterion to analyze the correlation between the sentimental factor and the market trend, and judge whether our factor is useful in financial investment.
 55 | \end{itemize}
 56 | 
 57 | The code we use are open source in Github.\footnote{\href{https://github.com/Coldog2333/Financial-NLP}{https://github.com/Coldog2333/Financial-NLP}}
 58 | 
 59 | The remainder of this paper is organized as follows: Section 2 describes the research background and related works of Jieba, Word2vec and WordNet. Section 3 shows the methodology and the data we use in analysis. Section 4 contains the experimental result and discussion. Finally, in Section 5, we proposed our conclusions.
 60 | \vspace{0.5cm}
 61 | \section{Related Works}
 62 | \subsection{\underline{Jieba}}
 63 | The tokenization of Chinese is much more complicated than English. To tokenize English words, we just need to split words in sentence by blank or punctuation. Chinese doesn't have blank between words. An additional step of tokenization is, therefore, needed.
 64 | 
 65 | Jieba(Chinese for "to shutter") Chinese text tokenization is a Chinese word tokenization module. The algorithm of Jieba is probability language modeling. It generates a trie tree based on a dictionary transcendentally and also calculate the frequency of words in the dictionary. When dealing with the sentence that is needed to be tokenized, it generates a DAG(Directed Acyclic Graph) to record every possible tokenization.A DAG is a dictionary, where the keys are the starting position of a word in the sentence and the values are lists of possible ending position.
 66 | 
 67 | For every possible words in the DAG, Jieba calculates their probability based on the transcendental dictionary. Then it find the path with the largest probability from the right side of the sentence to the left side. This largest probability path gives us the most possible tokenization.
 68 | 
 69 | In the case where the sentence includes words that are not in the dictionary, Jieba uses HMM(Hidden Markov Model) and Viterbi algorithm to tokenize. Every character has four conditions based on its possible condition in a word: B(Begin), M(Middle), E(End) and S(Single). The process of tokenizing words not in the dictionary is based on their conditions mainly. With three probability tables from the training of a large amount of texts, Jiaba then applies Viterbi algorithm to calculate the most possible condition of a word and uses the conditions chain to tokenize.
 70 | \subsection{\underline{Word2vec}}
 71 | 
 72 | In 2013, Google published a powerful tool named word2vec[\hyperref[ref 2]{2}]. It contains two models, one is the Skip-gram, another is Continuous bag of words(CBOW). With the word2vec model, we can turn a specific word into a calculable numeric vector. To speak of, moreover, it can well express the degree of similarity and analogy between two different words.
 73 | 
 74 | Since word2vec have been published, it is widely applied in Natural Language Processing, and its original models and training methods also enlighten many word embedding models and algorithms on the following days. Now, we introduce the word2vec model with an English example.
 75 | 
 76 | \subsubsection{\underline{Skip-gram}}
 77 | In Skip-gram, we focus on one word, and use it to predict which words will appear around it.
 78 | 
 79 | For example,"the boy adores that girl", we can achieve five background words like "the", "boy", "adores", "that", "girl" easily because we have blanks between every two words. Let "adores" be the center word, and set window size equals to 2, then, in Skip-gram, what we are interested in is the conditional probabilities of each background word under the given center word, where the background words is apart from the center word in two words. That is the mainly idea of Skip-gram. Let's describe the Skip-gram model in a strict mathematical language.
 80 | 
 81 | Assume that size of the set of dictionary index \textit{D} is $\big|$\textit{D}$\big|$, and denoted as \textit{D}=\{1,2,...,$\big|$\textit{D}$\big|$\}. Given a text sequence with the length of \textit{T}, and the $t^{th}$ word denoted as $w^{(t)}$.When window size equals to m, Skip-gram requires that we should maximize the total of all conditional probabilities of each background word that is apart from the center word in \textit{m} words under arbitrary center word.
 82 | 
 83 | \begin{equation}
 84 | \prod_{t=1}^{T}\prod_{-m \leq j \leq m, j\neq 0, 1\leq t+j \leq \big|T\big|}P(w^{(t+j)}\big|w^{(t)})
 85 | \end{equation}
 86 | 
 87 | So, the likelihood function is,
 88 | 
 89 | \begin{equation}
 90 | \sum_{t=1}^{T}\sum_{-m \leq j \leq m, j\neq 0, 1\leq t+j \leq \big|T\big|}log P(w^{(t+j)}\big|w^{(t)})
 91 | \end{equation}
 92 | 
 93 | Maximizing the likelihood function above minimize the following loss function,
 94 | 
 95 | \begin{equation}
 96 | -\frac{1}{T} \sum_{t=1}^{T}\sum_{-m \leq j \leq m, j\neq 0, 1\leq t+j \leq \big|T\big|}log P(w^{(t+j)}\big|w^{(t)})
 97 | \end{equation}
 98 | 
 99 | Denote the vectors of center words and background words with \textbf{v} and \textbf{u}, that is, as for a word with index \textit{i}, $\textbf{v}_{i}$ and $\textbf{u}_{i}$ are the vectors when it is as center word and background word. And the parameters of model we want to train are the two kinds of vectors of every words.
100 | 
101 | In order to implement the model parameters into loss function, we should express the conditional probabilities of background word under given center word with model parameters. Assume that generating each background words is independent mutually when center word is given, then as for the center word $w_{c}$ and the background word $w_{b}$, b, c are the indexes of them in the dictionary. Such that, the probability of generating background word $w_{b}$ under the given center word $w_{c}$ can be defined by softmax function, as
102 | 
103 | \begin{equation}
104 | P(w_{b}\big|w_{c})=\frac{exp(\textbf{u}_{b} ^{T}\textbf{v}_{c})}{\sum_{i\in D}exp(\textbf{u}_{i}^{T}\textbf{v}_{c})}
105 | \end{equation}
106 | 
107 | With derivation, we achieve the gradient of the conditional probability above,
108 | 
109 | \begin{equation}
110 | \frac{\partial logP(w_{b}\big|w_{c})}{\partial \textbf{v}_{c}}=\textbf{u}_{b}-\sum_{j\in D}\frac{exp(\textbf{u}_{j} ^{T}\textbf{v}_{c})}{\sum_{i\in D}exp(\textbf{u}_{i}^{T}\textbf{v}_{c})}\textbf{u}_{j}
111 | \end{equation}
112 | 
113 | Namely,
114 | 
115 | \begin{equation}
116 | \frac{\partial logP(w_{b}\big|w_{c})}{\partial \textbf{v}_{c}}=\textbf{u}_{b}-\sum_{j\in D}P(w_{j}\big|w_{c})\textbf{u}_{j}
117 | \end{equation}
118 | 
119 | Then, we can solve this by Gradient Descent or Stochastic Gradient Descent iteratively, and finally achieve the word vectors $v_{i}$ and $u_{i}$, \(i=1,2,...,\big|D\big|\) of every single words when it is as center word and background word, when the loss function reaches to minimum.
120 | 
121 | If the length of text sequence \textit{T} is too long, we can sample a rather short subsequence randomly to calculate the loss about this subsequence in each epoch, in order to find out an approximate solution.
122 | 
123 | In general, we will use the central word vector of Skip-gram as the word vector of each word in natural language processing application.
124 | 
125 | \subsubsection{\underline{Continuous Bag of Words}}
126 | 
127 | CBOW is similar to Skip-gram, this model predicts the central word with the background words around it in a text sequence. For example, "the boy adores that girl", we can achieve five background words like "the", "boy", "adores", "that", "girl". Let "adores" be the central word again, and set window size equals to 2, then, in CBOW, what we are interested in is the conditional probabilities of generating the given central word under all the background words which are apart from the central word in two words. That is the mainly idea of CBOW.
128 | 
129 | Assume that size of the set of dictionary index \textit{D} is $\big|\textit{D}\big|$, and denoted as \textit{D}=\{1,2,...,\big|\textit{D}\big|\}. Given a text sequence with the length of \textit{T}, and the $t^{th}$ word denoted as $w^{(t)}$.When window size equals to m, CBOW requires that we should maximize the total of all conditional probabilities of generating the arbitrarily given central word under all the background words which are apart from the central word in m words.
130 | 
131 | \begin{equation}
132 | \prod_{t=1}^{T}P(w^{(t)}\big|w^{(t-m)},...,w^{(t-1)},w^{(t+1)},...,w^{(t+m)})
133 | \end{equation}
134 | 
135 | where \textit{m} is the window size, and we should insure that (t-m+j)$\in$[1,\big|\textit{T}\big|],j$\in$[0,2m].
136 | 
137 | Therefore, the likelihood function is,
138 | 
139 | \begin{equation}
140 | \sum_{t=1}^{T}logP(w^{(t)}\big|w^{(t-m)},...,w^{(t-1)},w^{(t+1)},...,w^{(t+m)})
141 | \end{equation}
142 | 
143 | Maximizing the likelihood function above minimize the following loss function,
144 | 
145 | \begin{equation}
146 | -\sum_{t=1}^{T}logP(w^{(t)}\big|w^{(t-m)},...,w^{(t-1)},w^{(t+1)},...,w^{(t+m)})
147 | \end{equation}
148 | 
149 | We still use the notation when we discuss Skip-gram model. Now, as for the central word $v_{c}$ and its background words $w_{b0}$,$w_{b1}$,...,$w_{b\cdot 2m}$, such that, the probability of generating the given central word $w_{b}$ under all the background words $w_{b1}$,$w_{b2}$,...,$w_{b\cdot 2m}$ can be defined by softmax function as,
150 | 
151 | \begin{equation}
152 | \begin{aligned}
153 | & P(w_{c}\big|w_{b0},w_{b1},...,w_{b\cdot 2m}) \\
154 | & = \frac {exp( \frac{\textbf{v}_{c}^{T}(\textbf{u}_{b0}+\textbf{u}_{b1}+...+\textbf{u}_{b\cdot 2m})}{2m})} {\sum_{i\in D}exp(\frac{\textbf{v}_{i}^{T}(\textbf{u}_{b0}+\textbf{u}_{b1}+...+\textbf{u}_{b\cdot 2m})}{2m}) } \\
155 | \end{aligned}
156 | \end{equation}
157 | 
158 | With derivation, we achieve the gradient of the conditional probability above,
159 | 
160 | \begin{equation}
161 | \begin{aligned}
162 | & \frac{\partial logP(w_{c}\big|w_{b0},w_{b1},...,w_{b\cdot 2m})}{\partial \textbf{u}_{bi}} \\
163 | & =\frac{1}{2m} (\textbf{v}_{c}-\sum_{j\in D} \frac{exp( \frac{\textbf{v}_{c}^{T}(\textbf{u}_{b0}+\textbf{u}_{b1}+...+\textbf{u}_{b\cdot 2m})}{2m})} {\sum_{i\in D}exp(\frac{\textbf{v}_{i}^{T}(\textbf{u}_{b0}+\textbf{u}_{b1}+...+\textbf{u}_{b\cdot 2m})}{2m}) }\cdot \textbf{v}_{j}) \\
164 | \end{aligned}
165 | \end{equation}
166 | 
167 | Namely,
168 | 
169 | \begin{equation}
170 | \begin{aligned}
171 | & \frac{\partial logP(w_{c}\big|w_{b0},w_{b1},...,w_{b\cdot 2m})}{\partial \textbf{u}_{bi}} \\
172 | & = \frac{1}{2m} (\textbf{v}_{c}-\sum_{j\in D} P(w_{c}\big|w_{b0},w_{b1},...,w_{b\cdot 2m})\cdot \textbf{v}_{j}) \\
173 | \end{aligned}
174 | \end{equation}
175 | 
176 | As the same of Skip-gram, we can also solve this by Gradient Descent or Stochastic Gradient Descent iteratively, and finally achieve the word vectors $\textbf{v}_{i}$ and $\textbf{u}_{i}$ (\textit{i}=1,2,...,\big|\textit{D}\big|) of every single words when it is as center word and background word, when the loss function reaches to minimum.
177 | 
178 | If the length of text sequence T is too long, we can sample a rather short subsequence randomly to calculate the loss about this subsequence in each epoch, in order to find out an approximate solution.
179 | 
180 | In general, we will use the background word vector of CBOW as the word vector of each word in natural language processing application.
181 | 
182 | \vspace{0.5cm}
183 | \subsection{\underline{WordNet}}
184 | 
185 | WordNet is a large lexical database of English. In WordNet, synsets are interlinked by means of conceptual-semantic and lexical relations. The main relation between words in WordNet is synonym[\hyperref[ref 3]{3}]. By using a network to show the relation between words, WordNet helps us find synonym of words and also shows how two much words are similar with each other in the perspective of meanings.
186 | 
187 | % \begin{figure}[ht]
188 | % \centering
189 | % % \includegraphics[scale=0.2]{WordNet_example.png}
190 | % \caption{An Example of the Structure of WordNet[\hyperref[ref 4]{4}]}
191 | % \end{figure}
192 | 
193 | Having the properties above makes WordNet more reliable to analyze the sentiment. However, when using NLP under Chinese context, we should also translate Chinese to English at first. Thanks to the contributors of Chinese Open Wordnet (COW), as for a Chinese word, we can conveniently find the corresponding meaning in English, namely, we can find the corresponding node of this Chinese word in WordNet. COW is a large scale, freely available, semantic dictionary of Mandarin Chinese inspired by WordNet[\hyperref[ref 5]{5}]. It has the same structure and principles as WordNet but based on Chinese. It contains 42,315 synsets, 79,812 senses and 61,536 unique words and the construction is still ongoing. Our research mainly uses COW to compute the Senti-score of each single words.
194 | 
195 | \section{Methodology}
196 | \subsection{\underline{Data Mining}}
197 | 
198 | In order to gain plenty of financial news, we use selenium\footnote{\href{https://www.seleniumhq.org/}{https://www.seleniumhq.org/}} to crawl the news from network. We collect the historical data automatically and use them to compute historical Senti-scores.
199 | 
200 | At first we crawl from Xueqiu, bjzq (Beijing Securities website) and Chinastock. But later we find that Chinastock contains articles from a much longer time and broader areas. The sources of Chinastock already include many financial websites. As a result, we choose Chinastock to do text data crawling.
201 | 
202 | In order to do back testing more conveniently, we sort the text data by date.
203 | 
204 | \subsection{\underline{Pre-treatment}}
205 | 
206 | Chinese is one of typical isolated languages. Instead of using inflections, Chinese uses isolated function words and various word order to express grammar. Compared with English, Chinese has its uniqueness. Unlike English, Chinese does not use blank to tokenize words. Therefore, before doing natural language processing over Chinese, we need to have one more tokenization step, which turns an article into a bag of words. It is a significant difference between Chinese and English natural language processing. For any given piece of news or article, we use Jieba for pre-treatment. Besides turning the whole article into a bag of words, we also need to eliminate the stop words which are not important and will affect our analysis.
207 | 
208 | However, when we train our model with the corpus, we do not remove the stop words from the corpus because the lost of stop words in such a case will affect the generation of the proper word vector. We will make a concrete analysis of this question in \textbf{Text Analysis} below.
209 | 
210 | Besides, we also need to define some symbols like numbers and punctuation that needed to be removed in order to lessen the amount of calculation and noise.
211 | 
212 | \subsection{\underline{Text Analysis}}
213 | \subsubsection{\underline{Morphological Similarity}}
214 | 
215 | At the beginning, we should train our word2vec model achieving more word vectors as much as possible (Theoretically we can achieve word vectors of all words as long as we train word2vec with a large enough corpus, but in fact, we cannot always have such a corpus). For the generality, we choose zh-wiki as our training corpus.
216 | 
217 | It should be noted that we cannot remove stop words on training corpus because stop words can be significant describing a specified central word.
218 | 
219 | Set an appropriate large length of word vector in order to make words linearly separable in the hyper plane as far as possible. And we can set a proper threshold of frequency to omit some unfamiliar words, saving memory without losing any vital information.
220 | 
221 | After training, we achieve a word2vec model, meanwhile we get to know the word vectors of all words. We should just look up a word vector of a word in the model(like a dictionary).
222 | 
223 | Now, we can calculate the morphological similarity of two words with their cosine distance. For example, let's consider two words $w_{1}$ and $w_{2}$, and normalize as $w_{1}^{'}$ and $w_{2}^{'}$, so
224 | 
225 | \begin{equation}
226 | distance = \frac{w_{1}\cdot w_{2}}{\|w_{1}\|\|w_{2}\|} = w_{1}^{'}\cdot w_{2}^{'}
227 | \end{equation}
228 | 
229 | which indicates the morphological similarity of two words $w_{1}$ and $w_{2}$.
230 | 
231 | \textbf{Note}: Though now we can estimate the similarity of two words with word2vec, what we achieve above is only the similarity morphologically. That is to say, we can just find out which words are similar with a specified word morphologically, but do not know their meanings. Let's take an easy example, consider two words 'increase' and 'decrease', we can often read such a sentence 'The .DJI increases by 5 percent today' on the financial news, and you will find it certainly possible to exchange 'increase' into 'decrease' in this sentence without any difficulty. As known in \textbf{2.2}, the word vectors of this two words will approximately be the same, which may make us confused in determine their respective Senti-scores. But, what deserves to be mentioned is that word2vec does help us to find out some words familiar with a given word in a manner.
232 | 
233 | Facing this embarrassing situation, we propose a relatively well method in \textbf{\underline{\textit{D.Senti-score of Words}}}.
234 | 
235 | \subsubsection{\underline{Semantics Similarity}}
236 | 
237 | As mentioned in \textbf{2.3}, WordNet uses trees to record words. The structure of trees defines distances naturally. When computing the semantic similarity of words using WordNet, we use the shortest path linking the two nodes representing the words to compute. Take the reciprocal of the shortest path of the two nodes as the similarity. The similarity between a word and itself is 1. We also define the similarity between two words is 0 if there is no path linking them. With definitions above, the semantics similarity we compute will be a value between 0 and 1. The larger the value is, the more similar the two words are semantically.
238 | 
239 | \subsection{\underline{Senti-score Computation}}
240 | \subsubsection{\underline{Sentiment Lexicon}}
241 | 
242 | Generally in sentiment analysis, we will build a sentiment lexicon to tell our model that which words are positive or negative. It inspires us to define our specific sentiment lexicon so that our model can output the sentiment which reflects financial market.
243 | 
244 | Then, we define a sentiment lexicon as a small tally set to evaluate the sentiment of a single word. The idea is that it is possible to reflect the sentiment of a word by calculating the similarity of this word with the word in sentiment lexicon, which we initialize its sentiment. We choose 100 words that appear in financial news frequently and also have specific sentiment as our label words. On the other hand, in order to make the computation more fair, we choose 50 words that have positive attitude toward the market (positive words) and the other 50 have the opposite sentiment (negative words). Before computing the Senti-score of a piece of new, we firstly compute both the morphological similarity and the semantics similarity with the label words of every single word.
245 | 
246 | \hyperref[Table 1]{Table 1} provides some words in our sentiment lexicon as an example. The whole sentiment lexicon displays in our codes on our GitHub.\footnote{\href{https://github.com/Coldog2333/Financial-NLP}{https://github.com/Coldog2333/Financial-NLP}}
247 | \begin{table}[!ht]
248 |     \centering
249 |     \begin{tabular}{c|c}
250 |         \hline
251 |         \textbf{Positive} & \textbf{Negative}\\
252 |         \hline
253 |         {bullish} & {bearish} \\
254 |         {climb} & {fall} \\
255 |         {surge} & {slump} \\
256 |         {...} & {...} \\
257 |         {hortation} & {sanction} \\
258 |         \hline
259 |     \end{tabular}
260 |     \caption{Sentiment Lexicon}\label{Table 1}
261 | \end{table}
262 | 
263 | \subsubsection{\underline{Senti-score of Words}}
264 | 
265 | With the computation above, we will have a vector of 200 dimensions for every word. The first 100 dimensions are the Word2vec similarities with the 100 words in the sentiment lexicon. They show the morphological similarity with the label words in sentiment lexicon. The last 100 dimensions, on the other hand, are the WordNet similarities with the 100 words in the sentiment lexicon, which represent the semantic similarity with the label words. Since the words in the sentiment lexicon show attitudes toward the market, based on the similarities above, we can evaluate the attitude of a specific word, namely the sentiment of it. To represent the sentiment quantitatively, we define a value called Senti-score.
266 | 
267 | In our research, we use the similarity vector to compute the Senti-score of every word. In specific, the process includes the following steps: (1)Use the Word2vec similarity and the WordNet similarity respectively. Then use collaborative filtering to classify it as positive word or negative word. (2)Use Word2vec similarity to compute the Senti-score. The specific process and the reasons behind are as followed.
268 | 
269 | For a word, when we consider the similarities of it with those words in the sentiment lexicon, there are two kinds of similarities we should think about. The first one is the morphological similarity. The second one is the semantic similarity. Collaborative filtering can help us to find the several most similar words morphologically and semantically. First of all, we use the first 100 dimensions, which are the Word2vec similarities, to find the top \textit{n} similar words. We find them by picking up these \textit{n} words that have the largest value in the first 100 dimensions. The words we find in this way will be the \textit{n} words that are the most morphologically similar to the target word we need to compute. Then we compare the WordNet similarities of these \textit{n} words, which are in the last 100 dimensions. From these \textit{n} words, we pick up top\textit{m} of them that have the largest WordNet similarities. These \textit{m} words will be the top \textit{m} similar to the target word both morphologically and semantically.
270 | 
271 | With these \textit{m} words we can judge whether the target word is positive or negative. However, it is not enough if we just label it as +1 or -1. The reason is that the degree of positive or negative can be different for the same kind of words. A positive word can be more positive than another positive word. Therefore, we need to define a score to measure how positive or negative the word is. The method is, we firstly define the scores of positive words and negative words in sentiment lexicon are +1 and -1. Then we use Word2vec similarities of the \textit{m} words with a target word as the weights, and calculate the weighted average to be the Senti-score of a target word.
272 | 
273 | A problem we have to face is that the words included in the COW is far less than words in the Word2vec model we trained. In some cases, the last 100 dimensions of the word cannot be computed. In these cases, we use the Word2vec similarity only to evaluate. For these words, in the second step of collaborative filtering, we cannot find the top \textit{m} Semantically similar words from the top \textit{n} morphologically similar words. Therefore, we can just find the top\textit{m} morphologically similar words from top \textit{n} morphologically similar words. For the same reason, we still compute the Senti-score of every word weighted by Word2vec similarity.
274 | 
275 | Up to now, we can compute the Senti-score of every single word.
276 | 
277 | \subsubsection{\underline{Senti-score of Articles and Sentimental Factor}}
278 | 
279 | Use the Senti-scores of words above, we can compute the Senti-score of every piece of news. By adding up all the Senti-score of news in a day, we will get the sentimental factor we need. Details are as follows.
280 | 
281 | After the pretreatment, we will get a word bag. It is straight-forward to compute the similarity vector of each word in the word bag and use these vectors to compute the Senti-scores. However, it is not efficient and not necessary. Too many words will be computed for many times. The fact is, if we compute the Senti-socres of the most commonly used words in advance, we can get the Senti-score of an article much quicker and much more efficient.
282 | 
283 | We use over 50,000 common words in \emph{The Common Vocabulary of Modern Chinese} published by Commercial Press and pick up words that are not in these over 50,000 words in 3,000 pieces of news. Collect these words and we get a common word set of around 100,000 words. We compute the Senti-scores of words in common word set in advance.So that when we compute the Senti-score of an article, we will just need to look up the common word set. Some words may be ignored if we use this method, but our experiment results show that words not in the common word set will only be around 5\% of words in an article. The influence of ignoring the 5\% is just a drop in the bucket.
284 | 
285 | Use the method above we can compute Senti-score of news. We compute the average Senti-score of a day and use it as the sentimental factor that day. The sentimental factor computed in this way can show the sentiment of the market effectively.
286 | 
287 | \subsubsection{\underline{Adjustment of model}}
288 | 
289 | Until now, we have computed the Senti-score of every single day. When we make a time series analysis with the Senti-score and the market index on the same day, we find that though market index change smoothly, the Senti-score come to a violent fluctuations, which is inconsistent with reality. Therefore, we have to adjust the model before applying it. Then we smooth the Senti-score, that is, when evaluating the Sentiment scores of one day does not mean we should use the Senti-score of that day directly, but taking the average of the Senti-score of a period of time. It's reasonable because of the timeliness of information. As we know, the public sentiment of a day will not only influence the market at the same day, but also influence the market in the following days. Next, we will compare the adjusted model with the original model in the following experiment.
290 | 
291 | \subsection{\underline{Correlation Analysis}}
292 | We need a criterion to evaluate the efficiency of the sentimental factor. The main criterion is to do a linear regression using the sentimental factor and the market trend[\hyperref[ref 7]{7}]. Assume that the market sentiment has positive correlation with the market trend, then the significance of the regression result will be able to show the efficiency of the sentimental factor. What's more, we can also evaluate their correlation of them with Pearson correlation coefficient below.
293 | 
294 | \begin{equation}
295 | \rho_{X,Y} = \frac{E[X\dot Y]-E[X]E[Y]}{\sigma_{X}\dot \sigma_{Y}}
296 | \end{equation}
297 | 
298 | \vspace{0.5cm}
299 | 
300 | \section{\underline{Baseline}}
301 | 
302 | \textbf{Random.} We randomly create some series of real numbers as a random factor to compare with our sentimental factor. The one is generated by uniform distribution, which scales from the minimum to the maximum of our sentimental factor. Another is generated by normal distribution, which has the mean and the standard deviation same as our sentimental factor.
303 | 
304 | \textbf{Temperature.} Edward M. Saunders Jr.[\hyperref[ref 8]{8}] proposed that weather can also influent the financial market. To see whether our complex sentimental factor can outperform than the simply found factor, we compare it with our sentimental factor as a baseline. We download climatic data from NNDC.\footnote{\href{https://www7.ncdc.noaa.gov/CDO/cdoselect.cmd}{https://www7.ncdc.noaa.gov/CDO/cdoselect.cmd}} We especially choose the climatic data in Shanghai and Guangzhou.
305 | 
306 | \section{\underline{Experimental Results and Discussion}}
307 | 
308 | We calculate the standard sentimental factor of 1379 market days from 2012/11/6 to now, and make a correlation analysis about the standard sentimental factor and some market indexes like SSE and SZSE.
309 | 
310 | Linear regression is carried out with them, and we can see that all of the coefficients successfully pass significance test in \hyperref[Table 2]{Table 2} and \hyperref[Table 3]{Table 3}.
311 | 
312 | \begin{table}[h]
313 | \caption{The regression result of the standard sentimental factor with SSE}\label{Table 2}
314 | \centering
315 | \begin{tabular}{c|c}
316 | \hline
317 | {standard sentimental factor} & 11138.27 ***\\
318 | {p} & (3.48e-12) \\
319 | \hline
320 | \end{tabular}
321 | \end{table}
322 | 
323 | \begin{table}[h]
324 | \caption{The regression result of the standard sentimental factor with SZSE}\label{Table 3}
325 | \centering
326 | \begin{tabular}{c|c}
327 | \hline
328 | {standard sentimental factor} & 39796.6 ***\\
329 | {p} & ($<$2e-16) \\
330 | \hline
331 | \end{tabular}
332 | \end{table}
333 | 
334 | This time the Pearson correlation coefficient is 0.18731 (with SSE), which states the standard sentimental factor has a weak dependence with the market index. However, when we adopt the adjusted sentiment factor and apply a similar analysis, we can find that the Pearson correlation coefficient is 0.26119, which is improved a lots.
335 | 
336 | Especially, we do another experiment during the period of the China stock market crash, from 2015/02/11 to 2015/09/11. We compute the sentimental factor of 139 market days of this year, and similarly make a correlation analysis about the sentimental factor with SSE and SZSE.
337 | 
338 | This time the Pearson correlation coefficient is 0.36284, which states the Sentiment factor has a medium dependence with the market index. Moreover, when we adopt the adjusted Sentiment factor and apply a similar analysis, we can find that the Pearson correlation coefficient is 0.58815, which shows a higher dependence. What's more, when we make a time series analysis, we can see that especially during the period of the stock market crash, the sentimental factor moves almost the same as the market index.
339 | 
340 | % \begin{figure}[ht]
341 | % \centering
342 | % % \includegraphics[scale=0.18]{senti-score_vs_SSE(en)_2015.png}
343 | % \caption{Time Series of 10d Average Senti-score and SSE during the crash}
344 | % \end{figure}
345 | 
346 | Meanwhile, we apply the similar correlation analysis about the random factor and temperature with SSE and SZSE, and the complete result displayed on \hyperref[Table 4]{Table 4} and \hyperref[Table 5]{Table 5}. For reducing random errors, when we applying correlation analysis on random factor, we create 1000 series of real numbers and compute the average of these 1000 Pearson correlation coefficient as the final result.
347 | 
348 | \begin{table}[!ht]
349 |     \centering
350 |     \begin{tabular}{ccc}
351 |         \hline
352 |         \textbf{Pearson correlation coefficient}\\
353 |         \hline
354 |         {} & {SSE} & {SZSE} \\
355 |         \hline
356 |         {random from uniform} & {-0.00049399} & {-0.0005597} \\
357 |         {random from normal} & {0.00017451} & {0.00016782} \\
358 |         {temperature} & {-0.025135} & {-0.063723}\\
359 |         {standard sentimental factor} & {0.18731} & {0.22595}\\
360 |         {adjusted sentimental factor} & {\textbf{0.26119}} & {\textbf{0.28472}}\\
361 |         \hline
362 |     \end{tabular}
363 |     \caption{Pearson correlation coefficient from 2012/11/6 to 2018/8/17}\label{Table 4}
364 | \end{table}
365 | 
366 | \begin{table}[!ht]
367 |     \centering
368 |     \begin{tabular}{ccc}
369 |         \hline
370 |         \textbf{Pearson correlation coefficient}\\
371 |         \hline
372 |         {} & {SSE} & {SZSE} \\
373 |         \hline
374 |         {random from uniform} & {0.0011922} & {0.0010882} \\
375 |         {random from normal} & {0.00055439} & {0.00050227} \\
376 |         {temperature} & {0.10125} & {0.064122}\\
377 |         {standard sentimental factor} & {0.36284} & {0.37204}\\
378 |         {adjusted sentimental factor} & {\textbf{0.58815}} & {\textbf{0.58042}}\\
379 |         \hline
380 |     \end{tabular}
381 |     \caption{Pearson correlation coefficient from 2015/2/11 to 2015/9/11}\label{Table 5}
382 | \end{table}
383 | 
384 | From then on, we see that our model can provide a satisfied guide meaning, especially in the special period, namely the period which is influenced greatly by public sentiment.
385 | 
386 | \section{\underline{Conclusion}}
387 | 
388 | In this paper we develop an algorithm to compute a sentimental factor of Chinese markets and demonstrate that this factor has significant correlation with Chinese market. This factor provides us with a new way to make investment decisions.
389 | 
390 | The method to compute a sentimental factor is the main contribution of this paper. It will help us even more if we are able to compute sentimental factors for every financial product. Also, a combination of this sentimental factor and traditional financial factors may help us to make even better investment decisions. Looking forward to seeing related research.
391 | 
392 | \section*{Acknowledgment}
393 | 
394 | We would like to say thanks to MingWen Liu from ShiningMidas Private Fund for his generous help throughout the research. We are also grateful to Xingyu Fu from Sun Yat-sen University for his guidance and help. With their help, this research has been completed successfully.
395 | 
396 | \vspace{0.5cm}
397 | \begin{thebibliography}{1}
398 | \bibitem{IEEEhowto:kopka}
399 | Alec Go, Richa Bhayani, Lei Huang, 2009 [R]. Twitter Sentiment Classification Using Distant Supervision.\label{ref 1}
400 | \bibitem{IEEEhowto:kopka}
401 | Mikolov T, Chen K, Corrado G, et al. Efficient Estimation of Word Representations in Vector Space[J]. Computer Science, 2013. \label{ref 2}
402 | \bibitem{IEEEhowto:kopka}
403 | George A. Miller (1995). WordNet: A Lexical Database for English. Communications of the ACM Vol. 38, No. 11: 39-41.\label{ref 3}
404 | \bibitem{IEEEhowto:kopka}
405 | Bird S, Klein E, Loper E. Natural language processing with Python: analyzing text with the natural language toolkit[M]. " O'Reilly Media, Inc.", 2009. \label{ref 4}
406 | \bibitem{IEEEhowto:kopka}
407 | Building the Chinese Wordnet (COW): Starting from Core Synsets. In Proceedings of the 11th Workshop on Asian Language Resources: ALR-2013 a Workshop of The 6th International Joint Conference on Natural Language Processing (IJCNLP-6). Nagoya. pp.10-18. \label{ref 5}
408 | \bibitem{IEEEhowto:kopka}
409 | Mikolov, T., Sutskever, I., Chen, K., Corrado, G. S., \& Dean, J. (2013). Distributed representations of words and phrases and their compositionality. In Advances in neural information processing systems (pp. 3111-3119). \label{ref 6}
410 | \bibitem{IEEEhowto:kopka}
411 | Rao T, Srivastava S. Analyzing Stock Market Movements Using Twitter Sentiment Analysis[C]// International Conference on Advances in Social Networks Analysis and Mining. IEEE Computer Society, 2012:119-123. \label{ref 7}
412 | \bibitem{IEEEhowto:kopka}
413 | Jr E M S. Stock prices and the Wall Street weather[J]. American Economic Review, 1993, 83(3):p��gs. 1337-1345. \label{ref 8}
414 | \end{thebibliography}
415 | 
416 | \section{Appendix}
417 | \subsection{Figure}
418 | 
419 | \newpage
420 | %\begin{figure}[ht]
421 | %\begin{minipage}[t]{1\linewidth}
422 | %\centering
423 | %\includegraphics[scale=0.60]{Correlation_Between_Sentimental_Factor_and_SSE(en).png}
424 | %\caption{Correlation Between Sentimental Factor an SSE}
425 | %\label{fig:side:a}
426 | %\end{minipage}%
427 | %\begin{minipage}[t]{1\linewidth}
428 | %\centering
429 | %\includegraphics[scale=0.60]{Correlation_Between_Sentimental_Factor_and_SZSE(en).png}
430 | %\caption{Correlation Between Sentimental Factor an SZSE}
431 | %\label{fig:side:b}
432 | %\end{minipage}
433 | %\end{figure}
434 | 
435 | % \begin{figure}[ht]
436 | %   \centering
437 | %   % \includegraphics[scale=0.60]{Correlation_Between_Sentimental_Factor_and_SSE(en).png}
438 | %   \caption{Correlation Between Sentimental Factor an SSE}
439 | % \end{figure}
440 | 
441 | % \begin{figure}[ht]
442 | %   \centering
443 | %   % \includegraphics[scale=0.60]{Correlation_Between_Sentimental_Factor_and_SZSE(en).png}
444 | %   \caption{Correlation Between Sentimental Factor an SZSE}
445 | % \end{figure}
446 | 
447 | % \begin{figure*}[ht]
448 | %   \centering
449 | %   % \includegraphics[width=\textwidth]{senti-score_vs_SSE(en).png}
450 | %   \caption{Time Series of 10 Days Average Senti-score and SSE}
451 | % \end{figure*}
452 | 
453 | % \begin{figure*}[ht]
454 | %   \centering
455 | %   \includegraphics[width=\textwidth]{senti-score_vs_SZSE(en).png}
456 | %   \caption{Time Series of 10 Days Average Senti-score and SZSE}
457 | % \end{figure*}
458 | % \end{document}
459 | 


--------------------------------------------------------------------------------