├── .idea ├── .name ├── encodings.xml ├── misc.xml ├── modules.xml ├── p2p.iml ├── vcs.xml └── workspace.xml ├── README.md ├── __init__.py ├── data_to_mongodb.py ├── doc ├── imgs │ ├── 01.png │ ├── 02.png │ ├── 03.png │ ├── 04.png │ ├── 05.png │ ├── 06.png │ ├── 07.png │ ├── 08.png │ └── 09.png └── klj.pdf ├── others ├── 后台1 │ ├── get_bad_platform │ │ ├── .DS_Store │ │ ├── get_data.py │ │ └── readme.txt │ ├── get_history_data │ │ ├── check.py │ │ ├── get_data.py │ │ └── readme.txt │ ├── get_recent_news │ │ ├── .DS_Store │ │ ├── get_baidu.py │ │ ├── get_news.py │ │ ├── merge_data.py │ │ └── readme.txt │ ├── get_wangdaizhijia │ │ ├── .DS_Store │ │ ├── check.py │ │ ├── get_plat_form.py │ │ └── readme.txt │ └── get_wangdaizhijia_each_platform │ │ ├── .DS_Store │ │ ├── check.py │ │ ├── get_all_plat_id_for_search.py │ │ ├── get_display_detail │ │ ├── check.py │ │ ├── display_platform_detail.json │ │ ├── display_platform_detail_readme.txt │ │ ├── get_display_detail.py │ │ └── result.json │ │ ├── get_display_platform │ │ ├── check.py │ │ ├── display_platform.json │ │ ├── display_platform.py │ │ ├── get_hot.py │ │ ├── get_hotplat_charts.py │ │ └── readme.txt │ │ ├── get_platform_charts.py │ │ ├── get_platform_review.py │ │ ├── get_recent_news │ │ ├── check_data.py │ │ ├── filter.py │ │ └── get_news.py │ │ ├── get_recent_reviews │ │ ├── check.py │ │ ├── get_recent_review.py │ │ ├── recent_reviews_readme.txt │ │ └── reviews_filter.py │ │ ├── get_valid_reviews │ │ ├── check.py │ │ ├── display_platform.json │ │ ├── filter.py │ │ ├── get_hotplat_reviews.py │ │ └── plat_form_reviews_v2_readme.txt │ │ ├── platform_chart_readme.txt │ │ └── platform_search.json ├── 后台2 │ ├── article_classify.py │ ├── article_classify.pyc │ ├── article_data_loads.py │ ├── article_data_loads_delta.py │ ├── article_sentiment_extract.py │ ├── bad_platform_analyze.py │ ├── bad_platform_healthscore.py │ ├── demo_data_prepare.py │ ├── helper │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── get_finance_nouns.py │ │ ├── mongoDB_process.py │ │ ├── myio.py │ │ ├── myio.pyc │ │ ├── nlp_model.py │ │ ├── nlp_model.pyc │ │ ├── sentiments_analyze.py │ │ ├── sentiments_analyze.pyc │ │ ├── textprocessing.py │ │ └── textprocessing.pyc │ ├── hotEvent_trace.py │ ├── hot_keywords_extract.py │ ├── industry_analyze.py │ ├── knowledge_graph_build.py │ ├── mongoDB_Test.py │ ├── platform_article_keys_extract.py │ ├── platform_data_loads.py │ ├── process_analyze.py │ ├── sparser │ │ ├── hexun │ │ │ ├── ReadMe.txt │ │ │ └── hexun.py │ │ ├── p2pguancha_news.txt │ │ └── p2pguancha_sparser.py │ ├── spider │ │ ├── __init__.py │ │ ├── caixin_extractNews.py │ │ ├── extract_p2p_news.py │ │ ├── hujin_institute_process.py │ │ ├── ifeng_extractNews.py │ │ ├── jpm_extractNews.py │ │ ├── process_wdzjdata.py │ │ ├── sina_extractNews.py │ │ ├── weixin_extractNews.py │ │ ├── wy163_extractNews.py │ │ └── zhongshen_extractNews.py │ ├── summary_analyze.py │ ├── temp.py │ ├── topic_model.py │ ├── ugc_quality.py │ └── vectorize.py └── 爬虫 │ └── wd │ ├── bbs_rong360 │ ├── bbs_rong360 │ │ ├── .DS_Store │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── items.py │ │ ├── middlewares.pyc │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── settings.pyc │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ ├── bbs.py │ │ │ ├── bbs.pyc │ │ │ ├── content.py │ │ │ ├── content.pyc │ │ │ ├── detail.py │ │ │ └── detail.pyc │ ├── proxy_inuse.txt │ ├── randomproxy.py │ ├── randomproxy.pyc │ ├── scrapy.cfg │ └── urls.txt │ └── 爬虫文档.txt ├── run.py ├── static ├── css │ ├── 
bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── dashboard.css │ └── sign_in.css ├── data │ ├── charts_data.json │ ├── hot_keyword.json │ ├── hot_topic │ │ ├── 1 │ │ │ ├── hot.json │ │ │ ├── keywords.json │ │ │ └── news.json │ │ ├── 2 │ │ │ ├── hot.json │ │ │ ├── keywords.json │ │ │ └── news.json │ │ ├── 3 │ │ │ ├── hot.json │ │ │ ├── keywords.json │ │ │ └── news.json │ │ └── 4 │ │ │ ├── hot.json │ │ │ ├── keywords.json │ │ │ └── news.json │ ├── plat_recent_news.json │ ├── plat_related_news.json │ ├── plat_top_labels_sentiment.json │ ├── platform_info.json │ ├── platform_news_keywords.json │ ├── platform_reviews_v4.json │ ├── problem_platform.json │ ├── raw │ │ ├── news.json │ │ ├── opinion.json │ │ ├── policy.json │ │ └── ugc.csv │ └── recent_reviews.json ├── fonts │ ├── glyphicons-halflings-regular.eot │ ├── glyphicons-halflings-regular.svg │ ├── glyphicons-halflings-regular.ttf │ ├── glyphicons-halflings-regular.woff │ └── glyphicons-halflings-regular.woff2 ├── img │ ├── bg.jpg │ ├── detail.png │ ├── dl.jpg │ ├── hot_topic_1.jpg │ ├── hot_topic_2.jpg │ ├── hot_topic_3.jpg │ ├── hot_topic_4.jpg │ ├── mh3.jpg │ └── not_found.jpg └── js │ ├── bootstrap.min.js │ ├── echarts-all-2.2.7.js │ ├── jquery-1.12.1.min.js │ ├── jquery.cookie.js │ └── p2p │ ├── grzx.js │ ├── layout.js │ ├── ptda_detail_info.js │ ├── ptda_detail_rank.js │ ├── qwzx.js │ ├── qwzx_hot_topic.js │ ├── qwzx_type.js │ ├── qwzx_type_detail.js │ ├── tzgw.js │ ├── wtpt_da.js │ ├── wtpt_fx.js │ └── yqdp_charts.js ├── templates ├── detail_info.html ├── detail_navigation.html ├── detail_problem.html ├── detail_problem_analyze.html ├── detail_rank.html ├── grzx.html ├── home.html ├── info.html ├── info_hot_topic.html ├── info_hot_topic_news_detail.html ├── info_type.html ├── info_type_detail.html ├── layout.html ├── register.html ├── search.html ├── search_detail_info.html ├── search_info.html ├── search_not_found.html ├── sign_in.html └── yqdp.html └── test_db.py /.idea/.name: -------------------------------------------------------------------------------- 1 | p2p -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/p2p.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 15 | 16 | 17 | 19 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 一个基于 python 的 flask 框架的资讯网站 2 | 3 | 演示地址: http://119.29.100.53:8086/ 4 | ---- 5 | 6 | # 1 背景介绍 7 | 该比赛要求参赛者开发一款数据舆情产品,帮助用户了解 P2P 行业现状。本人在比赛中负责网站的设计、开发和部署。团队最终排名第2。涉及内容: 8 | * 前端:HTML5 
+ CSS + JavaScript + JSON
9 | * 后台:Python轻量级Web应用框架Flask
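下面用一段极简示意代码说明上述前后端如何配合:Flask 后台从 MongoDB 读取数据,以 JSON 返回给前端 JS 或渲染模板页面。注意这只是一个示意草稿,并不是本项目实际的 `run.py`;路由 `/api/news` 为假设的名字,数据库 `p2p`、集合 `news`、端口 8086 与后文 `data_to_mongodb.py` 及启动说明保持一致。

```python
# -*- coding: utf-8 -*-
# 示意:Flask 后台从 MongoDB 读取资讯数据,返回 JSON 给前端,或渲染模板页面(非项目实际代码)
from flask import Flask, jsonify, render_template
from pymongo import MongoClient

app = Flask(__name__)
db = MongoClient('localhost', 27017).p2p      # 与 data_to_mongodb.py 使用同一个数据库

@app.route('/api/news')                       # 假设的接口名,仅作说明
def api_news():
    # 去掉 _id 字段方便 JSON 序列化,这里只取前 10 条
    docs = list(db.news.find({}, {'_id': 0}).limit(10))
    return jsonify(news=docs)

@app.route('/')
def home():
    # templates/home.html 为项目中已有模板;实际页面可能还需要传入其它参数
    return render_template('home.html')

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8086)
```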
10 | 11 | # 2 项目基本介绍 12 | * [项目介绍PPT](https://github.com/mindawei/p2p/blob/master/doc/klj.pdf)。 13 | * 本项目主要是一个展示数据的网站。 14 | * 数据来源是其它三位队友爬取数据后处理得到的,他们的项目在[ others ](https://github.com/mindawei/p2p/tree/master/others)目录中。 15 | * 本项目数据源在[ static/data ](https://github.com/mindawei/p2p/tree/master/static/data)目录中,项目启动前需要将它们导入到 mongodb 数据库中。 16 | 17 | # 3 QuickStart 18 | ## 3.1 安装环境 19 | 1. 安装 python 2.7
20 | 下载 python 安装文件,安装后配置系统环境变量。
21 | 可参考[《Flask入门_Windows下安装》](https://www.cnblogs.com/Christeen/p/6514713.html) 22 | 23 | 2. 安装 flask
24 | 命令行运行 `pip install flask`。
25 | 可参考[《Flask入门_Windows下安装》](https://www.cnblogs.com/Christeen/p/6514713.html) 26 | 27 | 3. 安装 pymongo
28 | 命令行运行 `pip install pymongo`。 29 | 30 | 4. 安装mongodb 数据库
31 | * 官网下载[安装包](https://www.mongodb.com/download-center#community) 32 | * 创建一个db文件夹,我的文件位置是 `C:\software\mongdb3.6.3\db` 33 | 可参考[《Windows 平台安装 MongoDB》](http://www.runoob.com/mongodb/mongodb-window-install.html) 34 | 35 | ## 3.2 启动项目 36 | 1. 在 mongodb 目录的 bin 目录中执行 mongod.exe 文件。
37 | `C:\software\mongdb3.6.3\bin>mongod --dbpath C:\software\mongdb3.6.3\db` 38 | ![](https://github.com/mindawei/p2p/blob/master/doc/imgs/01.png) 39 | 40 | 2. 将数据导入到 mongodb 数据库中。
41 | 项目目录下运行 `data_to_mongodb.py` 文件, 命令行输入 `python data_to_mongodb.py`。 42 | ![](https://github.com/mindawei/p2p/blob/master/doc/imgs/02.png) 43 | 44 | 3. 启动项目。
45 | 项目目录下运行 `run.py` 文件, 命令行输入 `python run.py`。 46 | ![](https://github.com/mindawei/p2p/blob/master/doc/imgs/03.png) 47 | 48 | 4. 访问项目
49 | 输入 `http://localhost:8086` 进行访问。 50 | 51 | # 4 效果展示 52 | 访问地址: http://119.29.100.53:8086/ 53 | 54 | 一个测试账号 用户名:test 密码: 123 55 | 56 | ![](https://github.com/mindawei/p2p/blob/master/doc/imgs/04.png) 57 | 58 | ![](https://github.com/mindawei/p2p/blob/master/doc/imgs/05.png) 59 | 60 | ![](https://github.com/mindawei/p2p/blob/master/doc/imgs/06.png) 61 | 62 | ![](https://github.com/mindawei/p2p/blob/master/doc/imgs/07.png) 63 | 64 | ![](https://github.com/mindawei/p2p/blob/master/doc/imgs/08.png) 65 | 66 | ![](https://github.com/mindawei/p2p/blob/master/doc/imgs/09.png) 67 | 68 | # 5 后续项目 69 | [zsw](https://github.com/mindawei/zsw) 是基于该项目的一个简化版本,但是增加了一些帖子评论等功能。 70 | 71 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/__init__.py -------------------------------------------------------------------------------- /data_to_mongodb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pymongo import MongoClient 3 | import json 4 | import csv 5 | 6 | # 连接 7 | conn = MongoClient('localhost', 27017) 8 | # 连接数据库 9 | db = conn.p2p 10 | 11 | # 资讯类型 12 | 13 | # 新闻入库 14 | db.news.remove() 15 | data = json.load(open('static/data/raw/news.json', 'r')) 16 | db.news.insert(data) 17 | print("now the number of news is:%d" % db.news.count()) 18 | 19 | # 政策入库 20 | db.policy.remove() 21 | data = json.load(open('static/data/raw/policy.json', 'r')) 22 | db.policy.insert(data) 23 | print("now the number of policy is:%d" % db.policy.count()) 24 | 25 | # 政策入库 26 | db.opinion.remove() 27 | data = json.load(open('static/data/raw/opinion.json', 'r')) 28 | db.opinion.insert(data) 29 | print("now the number of opinion is:%d" % db.opinion.count()) 30 | 31 | 32 | # 用户评论入库 33 | db.ugc.remove() 34 | data = csv.reader(file('static/data/raw/ugc.csv', 'rb')) 35 | for line in data: 36 | if data.line_num == 1: 37 | continue 38 | item = dict() 39 | item['_id'] = line[0].decode('utf-8') 40 | item['item_type'] = line[1].decode('utf-8') 41 | item['source'] = line[2].decode('utf-8') 42 | item['url'] = line[3].decode('utf-8') 43 | item['author'] = line[4].decode('utf-8') 44 | item['title'] = line[5].decode('utf-8') 45 | item['content'] = line[6].decode('utf-8') 46 | item['item_pub_time'] = line[7].decode('utf-8') 47 | item['tags'] = line[8].decode('utf-8') 48 | item['cmt_cnt'] = line[9].decode('utf-8') 49 | item['fav_cnt'] = line[10].decode('utf-8') 50 | item['gmt_create'] = line[11].decode('utf-8') 51 | item['exinfo1'] = line[12].decode('utf-8') 52 | item['exinfo2'] = line[13].decode('utf-8') 53 | db.ugc.insert(item) 54 | print("now the number of ugc is:%d" % db.ugc.count()) 55 | 56 | # 初始用户 57 | db.user.remove(); 58 | db.user.insert({'username': 'mdw', 'password': '123','platform_names': [u'拍拍贷']}) 59 | print("now the number of user is:%d" % db.user.count()) 60 | -------------------------------------------------------------------------------- /doc/imgs/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/doc/imgs/01.png -------------------------------------------------------------------------------- /doc/imgs/02.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/doc/imgs/02.png -------------------------------------------------------------------------------- /doc/imgs/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/doc/imgs/03.png -------------------------------------------------------------------------------- /doc/imgs/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/doc/imgs/04.png -------------------------------------------------------------------------------- /doc/imgs/05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/doc/imgs/05.png -------------------------------------------------------------------------------- /doc/imgs/06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/doc/imgs/06.png -------------------------------------------------------------------------------- /doc/imgs/07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/doc/imgs/07.png -------------------------------------------------------------------------------- /doc/imgs/08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/doc/imgs/08.png -------------------------------------------------------------------------------- /doc/imgs/09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/doc/imgs/09.png -------------------------------------------------------------------------------- /doc/klj.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/doc/klj.pdf -------------------------------------------------------------------------------- /others/后台1/get_bad_platform/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/后台1/get_bad_platform/.DS_Store -------------------------------------------------------------------------------- /others/后台1/get_bad_platform/get_data.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import json 4 | 5 | data = requests.get("http://shuju.wdzj.com/problem-1.html", "html.parser").text 6 | soup = BeautifulSoup(data, "html.parser") 7 | all_data = soup.find_all("tr", attrs={"class": ""}) 8 | result = [] 9 | for item in all_data[1:]: 10 | raw = item.text.strip().split('\n') 11 | result.append({"index": raw[0], "platform_name": raw[1], "problem_time": raw[2], "online_time": raw[3], 12 | "registration capital": raw[4], "region": raw[5], "money": raw[6], "number": raw[7], 13 | "event_type": raw[8]}) 14 | all_data2 = soup.find_all("tr", attrs={"class": "tb_bg_gray"}) 15 | for 
item in all_data2: 16 | raw = item.text.strip().split('\n') 17 | result.append({"index": raw[0], "platform_name": raw[1], "problem_time": raw[2], "online_time": raw[3], 18 | "registration capital": raw[4], "region": raw[5], "money": raw[6], "number": raw[7], 19 | "event_type": raw[8]}) 20 | 21 | 22 | def toint(str): 23 | return int(str.replace(",", "")) 24 | 25 | 26 | result.sort(key=lambda x: toint(x['index'])) 27 | json.dump(result, open('problem_platform.json', 'w')) 28 | -------------------------------------------------------------------------------- /others/后台1/get_bad_platform/readme.txt: -------------------------------------------------------------------------------- 1 | 数据为问题平台基本情况,字段对应为: 2 | index : 序号 3 | platform_name : 平台名 4 | event_type : 时间类型 5 | problem_time : 问题时间 6 | money : 待收金额 7 | region : 地区 8 | online_time : 上线时间 9 | number : 涉及人数 10 | registration capital : 注册资本 11 | -------------------------------------------------------------------------------- /others/后台1/get_history_data/check.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | data = json.load(open('platform_web_info.json', 'r')) 4 | print len(data) 5 | -------------------------------------------------------------------------------- /others/后台1/get_history_data/get_data.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from bs4 import BeautifulSoup 4 | 5 | header = { 6 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36' 7 | } 8 | 9 | 10 | def get_result(platurl): 11 | data = { 12 | 'ht': 1, 13 | 'h': platurl 14 | } 15 | result = [] 16 | try: 17 | html = requests.get('http://tool.chinaz.com/history/', data=data, headers=header).text 18 | soup = BeautifulSoup(html, "html.parser") 19 | info = soup.find_all('ul', attrs={'class': 'ResultListWrap'})[0] 20 | # text = [item.strip() for item in info.text.strip().split('\n') if item.strip()] 21 | 22 | 23 | for item in info.find_all('li', attrs={'class': 'ReListCent ReLists clearfix'}): 24 | temp = [] 25 | for each in item.find_all('div'): 26 | if each.text.strip() != '': 27 | temp.append(each.text.strip()) 28 | if len(temp) > 9: 29 | continue 30 | if len(temp) < 9: 31 | while len(temp) != 9: 32 | temp.append('--') 33 | temp2 = [] 34 | for each in temp: 35 | if each.startswith('arguments'): 36 | temp2.append('--') 37 | else: 38 | temp2.append(each) 39 | result.append(temp2) 40 | for item in info.find_all('li', attrs={'class': 'ReListCent ReLists clearfix bg-list'}): 41 | temp = [] 42 | for each in item.find_all('div'): 43 | if each.text.strip() != '': 44 | temp.append(each.text.strip()) 45 | if len(temp) > 9: 46 | continue 47 | if len(temp) < 9: 48 | while len(temp) != 9: 49 | temp.append('--') 50 | temp2 = [] 51 | for each in temp: 52 | if each.startswith('arguments'): 53 | temp2.append('--') 54 | else: 55 | temp2.append(each) 56 | result.append(temp2) 57 | return result 58 | except Exception, e: 59 | print e 60 | return result 61 | 62 | 63 | def change_url(url): 64 | temp = url.split('//') 65 | if temp[1][-1] == '/': 66 | return temp[1][:-1] 67 | else: 68 | return temp[1] 69 | 70 | 71 | all_platform = json.load(open('platform_basic.json', 'r')) 72 | all_result = {} 73 | for each in all_platform: 74 | try: 75 | print each['platName'] 76 | all_result[each['platName']] = [] 77 | result = get_result(change_url(each['platUrl'])) 78 | result.sort() 79 | 
all_result[each['platName']].extend(result) 80 | except Exception, e: 81 | print e 82 | continue 83 | 84 | json.dump(all_result, open('result.json', 'w')) 85 | -------------------------------------------------------------------------------- /others/后台1/get_history_data/readme.txt: -------------------------------------------------------------------------------- 1 | 数据为每个平台的官网在站长之家的访问统计情况(http://tool.chinaz.com/history/?ht=1&h=www.rong360.com) 2 | 3 | 格式为: 4 | 5 | { 6 | "平台名":[ 7 | [日期,百度权重,预估流量,关键词数,站长排名,世界排名,流量排名,日均IP,日均PV] 8 | ] 9 | } 10 | 11 | 12 | '--':表示数据缺失 13 | 14 | 也有一部分平台无查询记录,list为空 15 | -------------------------------------------------------------------------------- /others/后台1/get_recent_news/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/后台1/get_recent_news/.DS_Store -------------------------------------------------------------------------------- /others/后台1/get_recent_news/get_baidu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 抓取相关百度新闻 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import urllib 6 | import json 7 | 8 | result = {} 9 | 10 | 11 | def get_news(word): 12 | name = urllib.urlencode({'name': word}).split('=')[1] 13 | soup = BeautifulSoup( 14 | requests.get( 15 | 'http://www.baidu.com/s?tn=baidurt&rtt=1&bsst=1&cl=3&ie=utf-8&bs={}&f=8&rsv_bp=1&wd={}&inputT=0'.format( 16 | name, 17 | name)).text, 18 | "html.parser" 19 | ) 20 | for item in soup.find_all('a', attrs={'target': '_blank'}): 21 | if item[ 22 | 'href'] != '#' and item.text != u'百度快照' and item.text != u'注册' \ 23 | and u'去网页搜索' not in item.text and item.text != u'帮助' and item.text != '': 24 | yield {'url': item['href'], 'title': item.text.strip()} 25 | 26 | 27 | all_plat = json.load(open('platform_basic.json', 'r'))[2500:2500] 28 | 29 | 30 | def get_plat_name(): 31 | for item in all_plat: 32 | result.setdefault(item['platName'], []) 33 | print item['platName'] 34 | for each in get_news(item['platName'].encode('utf-8')): 35 | result[item['platName']].append(each) 36 | 37 | 38 | if __name__ == '__main__': 39 | get_plat_name() 40 | json.dump(result, open('all_plat_recent_news_3.json', 'w')) 41 | -------------------------------------------------------------------------------- /others/后台1/get_recent_news/get_news.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 获取和讯网上平台信息 3 | import requests 4 | import json 5 | import re 6 | from bs4 import BeautifulSoup 7 | 8 | 9 | def get_all_id(index): 10 | data = open('page{}'.format(index), 'r').read() 11 | name_pattern = re.compile(r"(.+?) 
1: 11 | result[item].append({'url': each['url'], 'title': each['title']}) 12 | else: 13 | continue 14 | 15 | json.dump(result, open('data_3.json', 'w')) 16 | -------------------------------------------------------------------------------- /others/后台1/get_wangdaizhijia_each_platform/get_recent_news/filter.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | data = json.load(open('all_plat_wangdai_news.json', 'r')) 4 | for each in data: 5 | for item in data[each]: 6 | if not item['url'].startswith('http'): 7 | item['url'] = 'http://' + item['url'] 8 | 9 | json.dump(data, open('all_plat_related_news.json', 'w')) 10 | -------------------------------------------------------------------------------- /others/后台1/get_wangdaizhijia_each_platform/get_recent_news/get_news.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from bs4 import BeautifulSoup 4 | 5 | headers = { 6 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36' 7 | 8 | } 9 | 10 | result = {} 11 | 12 | 13 | def get_news(pinyin): 14 | temp = [] 15 | req = requests.get('http://www.wdzj.com/dangan/{}/'.format(pinyin), headers=headers) 16 | req.encoding = 'utf-8' 17 | soup = BeautifulSoup(req.text, "html.parser") 18 | try: 19 | for item in soup.find_all('ul', attrs={'class': "newsList"}): 20 | for each in item.find_all('a'): 21 | if 'http' in each['href']: 22 | the_url = each['href'] 23 | else: 24 | the_url = 'www.wdzj.com' + each['href'] 25 | temp.append({'url': the_url, 'title': each.text}) 26 | 27 | return temp 28 | except: 29 | return temp 30 | 31 | 32 | for item in json.load(open('platform_search.json', 'r')): 33 | print item['platName'] 34 | result[item['platName']] = get_news(item['platPin']) 35 | 36 | json.dump(result, open('all_plat_wangdai_news.json', 'w')) 37 | -------------------------------------------------------------------------------- /others/后台1/get_wangdaizhijia_each_platform/get_recent_reviews/check.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | data = json.load(open('recent_reviews.json', 'r')) 4 | print len(data) 5 | -------------------------------------------------------------------------------- /others/后台1/get_wangdaizhijia_each_platform/get_recent_reviews/get_recent_review.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | header = { 5 | 'Accept': '*/*', 6 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 7 | 'Host': 'www.wdzj.com', 8 | 'Origin': 'http://www.wdzj.com', 9 | 'Referer': 'http://www.wdzj.com/dangan/dianping/', 10 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36' 11 | 12 | } 13 | 14 | result = [] 15 | for i in range(1, 10): 16 | data = { 17 | 'orderType': 0, 18 | 'currentPage': i, 19 | 'allReview': 1 20 | } 21 | data = \ 22 | requests.post('http://www.wdzj.com/front_plat-review-list', headers=header, data=data).json()[0]['platReview'][ 23 | 'reviewList'] 24 | result.extend(data) 25 | json.dump(result, open('raw_recent_reviews.json', 'w')) 26 | -------------------------------------------------------------------------------- /others/后台1/get_wangdaizhijia_each_platform/get_recent_reviews/recent_reviews_readme.txt: 
-------------------------------------------------------------------------------- 1 | 文件为 网贷之家中,针对各平台的最近点评. 2 | 3 | 每条点评格式为: 4 | { 5 | 'platName':点评平台名 6 | 'evaluation' : 评价者态度(0-不推荐,1-一般,2-推荐) 7 | 'reviewContent': 评价内容 8 | 'label':评价标签 9 | 'reviewUserName':用户名 10 | 'reviewDate':评价时间 11 | } 12 | 13 | 14 | 如下: 15 | { 16 | "reviewContent": "平台不错,,都是月标,,投资2000赚了200多。因为注册送了145代金券,,然后春节期间领取到红包100。", 17 | "reviewDate": "2016-04-14 14:40:18", 18 | "label": [ 19 | "还不错" 20 | ], 21 | "reviewUserName": "8883662846", 22 | "platName": "睿银财富", 23 | "evaluation": "2" 24 | } 25 | 26 | 建议: 27 | 可以考虑在平台档案页面做个最新滚动评论 -------------------------------------------------------------------------------- /others/后台1/get_wangdaizhijia_each_platform/get_recent_reviews/reviews_filter.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | result = [] 4 | 5 | data = json.load(open('raw_recent_reviews.json', 'r')) 6 | for item in data: 7 | comment = {} 8 | labels = [] 9 | if 'platReviewTagList' in item: 10 | for each in item['platReviewTagList']: 11 | if 'tagName' in each: 12 | labels.append(each['tagName']) 13 | comment['reviewContent'] = item['reviewContent'] 14 | comment['platName'] = item['platName'] 15 | comment['evaluation'] = item['evaluation'] 16 | comment['reviewDate'] = item['reviewDate'] 17 | comment['reviewUserName'] = item['reviewUserName'] 18 | comment['label'] = labels 19 | result.append(comment) 20 | 21 | json.dump(result, open('recent_reviews.json', 'w')) 22 | -------------------------------------------------------------------------------- /others/后台1/get_wangdaizhijia_each_platform/get_valid_reviews/check.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import json 3 | 4 | data = json.load(open('platform_reviews_v3.json', 'r')) 5 | print len(data) 6 | -------------------------------------------------------------------------------- /others/后台1/get_wangdaizhijia_each_platform/get_valid_reviews/filter.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | data = json.load(open('all_platform_reviews.json', 'r')) 4 | for each in data: 5 | for item in data[each]['reviews']: 6 | temp = sorted(data[each]['reviews'][item]['labels'].items(), key=lambda x: x[1], reverse=True) 7 | temp = [temp_label[0] for temp_label in temp[:5]] 8 | data[each]['reviews'][item]['labels'] = temp 9 | # print each 10 | # print data[each]['reviews'][item]['labels'] 11 | data[each]['reviews'][item]['comments'] = data[each]['reviews'][item]['comments'][:90] 12 | print len(data[each]['reviews'][item]['comments']) 13 | json.dump(data, open('platform_reviews_v5.json', 'w')) 14 | -------------------------------------------------------------------------------- /others/后台1/get_wangdaizhijia_each_platform/get_valid_reviews/get_hotplat_reviews.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | new_plats = json.load(open('platform_search.json', 'r')) 4 | # display_plat = json.load(open('display_platform.json', 'r')) 5 | name_id = {} 6 | for item in new_plats: 7 | # if item['platName'] in display_plat: 8 | name_id[item['platName']] = item['platId'] 9 | all_reviews = json.load(open('platform_reviews.json', 'r')) 10 | 11 | result = {} 12 | for name in name_id: 13 | result[name] = { 14 | 'reviews': {'0': {'labels': {}, 'comments': []}, '1': {'labels': {}, 'comments': []}, 15 | '2': {'labels': {}, 'comments': []}}} 16 | try: 17 | 
for each in all_reviews[name_id[name]]['reviews']: 18 | if 'platReviewTagList' in each: 19 | if len(each['platReviewTagList']): 20 | for item in each['platReviewTagList']: 21 | if 'tagName' in item: 22 | if 'amp' not in item['tagName']: 23 | result[name]['reviews'][each['evaluation']]['labels'][item['tagName']] = \ 24 | result[name]['reviews'][each['evaluation']]['labels'].get(item['tagName'], 0) + 1 25 | try: 26 | if not 'hellip' in each['reviewContent'] and not 'amp' in each['reviewContent']: 27 | result[name]['reviews'][each['evaluation']]['comments'].append( 28 | {'content': each['reviewContent'], 'date': each['reviewDate']}) 29 | except: 30 | continue 31 | except: 32 | continue 33 | 34 | json.dump(result, open('all_platform_reviews.json', 'w')) 35 | -------------------------------------------------------------------------------- /others/后台1/get_wangdaizhijia_each_platform/get_valid_reviews/plat_form_reviews_v2_readme.txt: -------------------------------------------------------------------------------- 1 | 更新的评论信息,格式为 2 | 3 | { 4 | 平台名称:{ 5 | "reviews"{ 6 | "0"(不推荐):{ 7 | "labels":{ 8 | "平台大":5(出现次数), 9 | "提现快":6 10 | ... 11 | } 12 | "comments":[ 13 | { 14 | "content":评论内容, 15 | "date":评论时间 16 | } 17 | ] 18 | } 19 | "1"(一般):{ 20 | "labels":{}, 21 | "comments":{} 22 | } 23 | "2"(推荐):{ 24 | "labels":{}, 25 | "comments":{} 26 | } 27 | } 28 | } 29 | 30 | } -------------------------------------------------------------------------------- /others/后台1/get_wangdaizhijia_each_platform/platform_chart_readme.txt: -------------------------------------------------------------------------------- 1 | 结构为: 2 | 3 | { 4 | "99"(平台id):{ 5 | "0"(利率和成交量信息):{ 6 | x(横轴-时间轴):{ 7 | [ 8 | "2015-04-10", 9 | .... 10 | ] 11 | }, 12 | y1(利率):{ 13 | [ 14 | 1053.71, 15 | ... 16 | ] 17 | }, 18 | y2(成交量):{ 19 | [ 20 | 84711.06, 21 | ... 
22 | ] 23 | } 24 | } 25 | "1"(待还款\资金净流入):{ 26 | x:{[]}, 27 | y1(待还款):{[]}, 28 | y2(资金净流入):{[]} 29 | } 30 | "2"(投资人数\借款人数):{ 31 | x:{[]}, 32 | y1(投资人数):{[]}, 33 | y2(借款人数):{[]} 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /others/后台2/article_classify.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.4.14 01:05 first version 5 | 资讯分类,类别包括: 6 | 高层变动、新产品、平台跑路、提现困难、相关指标、 7 | ''' 8 | 9 | import csv 10 | import json 11 | import time 12 | import re 13 | from string import punctuation,digits,letters,whitespace 14 | import sys 15 | import os 16 | import datetime 17 | import jieba 18 | import jieba.analyse 19 | import pandas as pd 20 | import types 21 | from pymongo import MongoClient 22 | client=MongoClient() 23 | reload(sys) 24 | sys.setdefaultencoding('utf-8') 25 | csv.field_size_limit(sys.maxint) 26 | 27 | 28 | 29 | #tag 高层变动 30 | def getArticleTag(title, content): 31 | recall_key_list = { 32 | '高层变动':{'高层变动', '高管变动', '换人', '离职'},\ 33 | '新产品':{'新产品', '产品上市'},\ 34 | '平台跑路':{'平台跑路', '跑路'},\ 35 | '提现困难':{'提现困难', '无法兑付'},\ 36 | '平台融资':{'A轮融资', 'B轮融资', 'C轮融资', 'D轮融资', 'E轮融资', '估值', 'IPO', '上市'}\ 37 | } 38 | tag_list = '' 39 | for tag in recall_key_list: 40 | for key in recall_key_list[tag]: 41 | if title.find(key) != -1 or content.find(key) != -1: 42 | tag_list += tag + ',' 43 | break 44 | if tag_list != '': 45 | tag_list = tag_list[:-1] 46 | # print title, tag_list 47 | return tag_list 48 | 49 | 50 | if __name__ == "__main__": 51 | db = client.holmesdb 52 | t_news = db.t_news_di 53 | t_news_res = t_news.find() 54 | for news in t_news_res: 55 | title = news['title'] 56 | content = news['content'] 57 | flag = getArticleTag(title, content) 58 | 59 | -------------------------------------------------------------------------------- /others/后台2/article_classify.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/后台2/article_classify.pyc -------------------------------------------------------------------------------- /others/后台2/article_data_loads_delta.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.3 20:30 first version 5 | 加载文件 6 | ''' 7 | 8 | import csv 9 | import json 10 | import time 11 | import re 12 | from string import punctuation,digits,letters,whitespace 13 | import sys 14 | import os 15 | import datetime 16 | from bson import ObjectId 17 | import jieba 18 | import jieba.analyse 19 | import pandas as pd 20 | import pymongo 21 | from pymongo import MongoClient 22 | client=MongoClient() 23 | reload(sys) 24 | sys.setdefaultencoding('utf-8') 25 | csv.field_size_limit(sys.maxint) 26 | 27 | 28 | 29 | def handleContent(string): 30 | """字符串处理,去标点符号,中文分词,return:unicode""" 31 | string = string.decode('utf-8') 32 | #针对自己的文本数据定制化修改 33 | string = string.replace("
<br><br>", "").replace("<br/><br/>
", "").replace("", "").replace("", "") 34 | string = string.replace("#r#", "\n").replace("#n#", "\n").replace("", "") 35 | string = string.replace(" ", "").replace("\n", "").replace("\t", " ") 36 | 37 | string = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!:》,《”。“?、~@#¥%……&*()]+".decode("utf-8"), "".decode("utf-8"),string) 38 | string = string.encode('utf-8') 39 | string = string.translate(None,punctuation+digits+letters+whitespace) 40 | return string 41 | 42 | def getJsonFile(json_fname): 43 | json_file = file(json_fname, "r") 44 | json_vector = [] 45 | for line in json_file: 46 | person_info = json.loads(line) 47 | json_vector.append(person_info) 48 | return json_vector 49 | 50 | def getTable_maxID(mongodb_table, field): 51 | try: 52 | res = mongodb_table.find().sort(field, pymongo.DESCENDING)[0] 53 | if res == None: 54 | return 0 55 | return int(res[field]) 56 | except: 57 | return 0 58 | 59 | def insertDB(mongodb_table, line, cols): 60 | try: 61 | ori = cols[0] 62 | cols[0] = "_id" 63 | data = {} 64 | for i in xrange(0, len(cols)): 65 | data.setdefault(cols[i], line[i]) 66 | mongodb_table.insert(data) 67 | cols[0] = ori 68 | except Exception : 69 | cols[0] = ori 70 | print Exception 71 | return 72 | 73 | print "before load data", datetime.datetime.now() 74 | # 数据加载 75 | news_title_dict = {} 76 | texts_news = [] 77 | texts_news_other = [] 78 | texts_ugc = [] 79 | texts_experts = [] 80 | texts_policy = [] 81 | texts_nlp_train = [] 82 | 83 | db = client.holmesdb 84 | t_news = db.t_news_di 85 | t_news.remove() 86 | t_news_id = getTable_maxID(t_news, "_id") 87 | #news & policy 88 | columns = [] 89 | dir_list = ["./data/news"] 90 | for dir in dir_list: 91 | print dir 92 | for fname in os.listdir(dir): 93 | if fname.find(".csv") != -1: 94 | fname = fname.decode("gbk") 95 | f_in = csv.reader(file(dir + r"/" + fname, "r")) 96 | lines = [line for line in f_in] 97 | if len(columns) == 0: 98 | columns = lines[0] 99 | for line in lines[1:]: 100 | if len(line) < 14: 101 | continue 102 | if len(line) > 14: 103 | line = line[:14] 104 | texts_news.append(line) 105 | line[6] = line[6].replace("###r###", "\r").replace("###n###", "\n").replace("###t###", "\t") 106 | title = line[5] 107 | if title not in news_title_dict: 108 | news_title_dict.setdefault(title, 0) 109 | line[0] = str(t_news_id ) 110 | t_news_id += 1 111 | insertDB(t_news, line, columns) 112 | print len(news_title_dict) 113 | print columns 114 | data = pd.DataFrame(texts_news) 115 | data.columns = columns 116 | pd.to_pickle(data, 'data/news_dataset.pkl') 117 | 118 | -------------------------------------------------------------------------------- /others/后台2/article_sentiment_extract.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.3 20:30 first version 5 | 加载文件 6 | ''' 7 | 8 | import csv 9 | import json 10 | import time 11 | import re 12 | from string import punctuation,digits,letters,whitespace 13 | import sys 14 | import os 15 | import datetime 16 | from bson import ObjectId 17 | import jieba 18 | import jieba.analyse 19 | import pandas as pd 20 | from helper import sentiments_analyze as sa 21 | import pymongo 22 | from pymongo import MongoClient 23 | client=MongoClient() 24 | reload(sys) 25 | sys.setdefaultencoding('utf-8') 26 | csv.field_size_limit(sys.maxint) 27 | 28 | 29 | 30 | def handleContent(string): 31 | """字符串处理,去标点符号,中文分词,return:unicode""" 32 | string = string.decode('utf-8') 33 | #针对自己的文本数据定制化修改 34 | string = string.replace("
<br><br>", "").replace("<br/><br/>
", "").replace("", "").replace("", "") 35 | string = string.replace("#r#", "\n").replace("#n#", "\n").replace("", "") 36 | string = string.replace(" ", "").replace("\n", "").replace("\t", " ") 37 | 38 | string = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!:》,《”。“?、~@#¥%……&*()]+".decode("utf-8"), "".decode("utf-8"),string) 39 | string = string.encode('utf-8') 40 | string = string.translate(None,punctuation+digits+letters+whitespace) 41 | return string 42 | 43 | def getJsonFile(json_fname): 44 | json_file = file(json_fname, "r") 45 | json_vector = [] 46 | for line in json_file: 47 | person_info = json.loads(line) 48 | json_vector.append(person_info) 49 | return json_vector 50 | 51 | def getTable_maxID(mongodb_table, field): 52 | try: 53 | res = mongodb_table.find().sort(field, pymongo.DESCENDING)[0] 54 | if res == None: 55 | return 0 56 | return int(res[field]) 57 | except: 58 | return 0 59 | 60 | 61 | print "before load data", datetime.datetime.now() 62 | # 数据加载 63 | news_title_dict = {} 64 | texts_news = [] 65 | texts_news_other = [] 66 | texts_ugc = [] 67 | texts_experts = [] 68 | texts_policy = [] 69 | texts_nlp_train = [] 70 | 71 | #news & policy 72 | columns = [] 73 | fname = u"./data/ugc_opinion_comment/用户点评-网贷之家-融360.csv" 74 | 75 | f_in = csv.reader(file(fname, "r")) 76 | lines = [line for line in f_in] 77 | if len(columns) == 0: 78 | columns = lines[0] 79 | for line in lines[1:]: 80 | if len(line) < 14: 81 | continue 82 | if len(line) > 14: 83 | line = line[:14] 84 | line[6] = line[6].replace("###r###", "\r").replace("###n###", "\n").replace("###t###", "\t") 85 | line[6] = line[6].replace("#r#", "\r").replace("#n#", "\n").replace("#t#", "\t") 86 | title = line[5] 87 | # print title 88 | texts_news_other.append(line) 89 | try: 90 | print sa.single_review_sentiment_score(line[6])[:2], line[6] 91 | except Exception: 92 | pass 93 | texts_ugc.append(line) 94 | 95 | texts_nlp_train.append(line) 96 | -------------------------------------------------------------------------------- /others/后台2/bad_platform_healthscore.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.4.2 14:11 first version 5 | 问题平台分析 6 | ''' 7 | 8 | import csv 9 | import json 10 | import time 11 | import re 12 | from string import punctuation,digits,letters,whitespace 13 | import sys 14 | import os 15 | import datetime 16 | import jieba 17 | import jieba.analyse 18 | import pandas as pd 19 | import types 20 | from pymongo import MongoClient 21 | from helper import myio 22 | 23 | client=MongoClient() 24 | reload(sys) 25 | sys.setdefaultencoding('utf-8') 26 | csv.field_size_limit(sys.maxint) 27 | 28 | 29 | 30 | def handleContent(string): 31 | """字符串处理,去标点符号,中文分词,return:unicode""" 32 | string = string.decode('utf-8') 33 | #针对自己的文本数据定制化修改 34 | string = string.replace("
<br><br>", "").replace("<br/><br/>
", "").replace("", "").replace("", "") 35 | string = string.replace("#r#", "\n").replace("#n#", "\n").replace("", "") 36 | string = string.replace(" ", "").replace("\n", "").replace("\t", " ") 37 | 38 | string = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!:》,《”。“?、~@#¥%……&*()]+".decode("utf-8"), "".decode("utf-8"),string) 39 | string = string.encode('utf-8') 40 | string = string.translate(None,punctuation+digits+letters+whitespace) 41 | return string 42 | def getJsonFile_line(json_fname): 43 | json_file = file(json_fname, "r") 44 | json_vector = [] 45 | for line in json_file: 46 | person_info = json.loads(line) 47 | json_vector.append(person_info) 48 | return json_vector 49 | def getJsonFile_all(json_fname): 50 | json_file = open(json_fname, "r") 51 | dict = json.load(json_file) 52 | json_file.close() 53 | return dict 54 | 55 | print "before load bad company datas", datetime.datetime.now() 56 | db = client.holmesdb 57 | t_news = db.t_news_di 58 | t_policy = db.t_policy_di 59 | t_ugc = db.t_ugc_di 60 | t_expert = db.t_expert_opinion_di 61 | t_news_caixin = db.t_news_caixin_di 62 | news_res = t_news.find() 63 | policy_res = t_policy.find() 64 | ugc_res = t_ugc.find() 65 | expert_res = t_expert.find() 66 | 67 | bad_company_2015 = getJsonFile_all("./data/bad_platform/bad_platform_2015.json") 68 | print "end load bad company datas", datetime.datetime.now() 69 | 70 | 71 | article_res = [news_res, policy_res, expert_res, ugc_res] 72 | for data_set in article_res: 73 | print data_set.count(), 74 | print "" 75 | key = ["news", "policy", "expert", "ugc"] 76 | month_summary = {} 77 | for i in xrange(len(article_res)): 78 | for res in article_res[i]: 79 | if res['item_pub_time'] >= '2014-01-01' and res['item_pub_time'] <= '2015-12-31': 80 | title = res['title'] 81 | content = res['content'] 82 | t = res['item_pub_time'] 83 | m = t[0:7].replace("-", ".") 84 | date = t[5:].replace("-", ".").split(" ")[0] 85 | for pjson in bad_company_2015: 86 | pname = pjson['platform_name'] 87 | if content.find(pname) != -1 or content.find(pname) != -1: 88 | month_summary[pname][m] = month_summary.setdefault(pname, {}).setdefault(m, 0) + 1 89 | #print pname, m, month_summary[pname][m] 90 | bad_platform_trend = {} 91 | for pjson in bad_company_2015: 92 | pname = pjson['platform_name'] 93 | pro_date = pjson['problem_time'] 94 | last_year_date = "2014." 
+ pro_date[5:7] 95 | sum_cnt = 0 96 | p_trend = [] 97 | if pname in month_summary: 98 | month_data = sorted(month_summary[pname].items(), lambda a, b: cmp(a[0], b[0])) 99 | for (m, cnt) in month_data: 100 | print pname, m, pro_date, last_year_date 101 | if m >= last_year_date and m < pro_date: 102 | try: 103 | y1, m1 = int(m[:4]), int(m[5:]) 104 | y2, m2 = int(pro_date[:4]), int(pro_date[5:]) 105 | delta = (y2 - y1) * 12 + m2 - m1 106 | sum_cnt += cnt 107 | p_trend.append((delta, cnt)) 108 | except Exception: 109 | print Exception 110 | continue 111 | if sum_cnt >= 20: 112 | print pname, pro_date, p_trend 113 | bad_platform_trend.setdefault(pname, [pro_date, p_trend]) 114 | myio.writeJsonDict(bad_platform_trend, open("./data/bad_platform/bad_platform_trend.json", "w"), "rows") 115 | print "end analyze bad company datas", datetime.datetime.now() 116 | 117 | 118 | -------------------------------------------------------------------------------- /others/后台2/helper/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Administrator' 2 | -------------------------------------------------------------------------------- /others/后台2/helper/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/后台2/helper/__init__.pyc -------------------------------------------------------------------------------- /others/后台2/helper/get_finance_nouns.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.2.27 20:30 first version 5 | 分词、提取关键字、提取文章主题 6 | ''' 7 | 8 | 9 | import sys 10 | reload(sys) 11 | sys.setdefaultencoding('utf-8') 12 | 13 | 14 | f_out = open("data/finance_words/eco_out.txt", "w") 15 | word_dict = {} 16 | for line in open("data/finance_words/eco.txt", "r"): 17 | #print "###" ,line 18 | if line.find("、") != -1: 19 | if line[line.find("、"):line.find("(") ].find(" ") != -1: 20 | line = line[line.find(" ")+ 1:line.find("(") ] 21 | else: 22 | line = line[line.find("、") + len("、"):line.find("(") ] 23 | flag = 0 24 | for c in line: 25 | if c.isalpha() == True: 26 | flag = 1 27 | if flag == 0 and len(line) > 1 and len(line) < 30: 28 | word_dict.setdefault(line, 0) 29 | 30 | for line in open("data/finance_words/p2p_sentence.txt", "r"): 31 | if line.find(":") != -1: 32 | line = line[:line.find(":")] 33 | while line[0] == " ": line = line[1:] 34 | while line[-1] == " ": line = line[:-1] 35 | while line[:len(" ")] == " ": line = line[len(" "):] 36 | print line 37 | m = 0 38 | sep_list = [".",".","、"," "] 39 | for sep in sep_list: 40 | if line.find(sep) != -1: 41 | m = max(m, line.find(sep) + len(sep)) 42 | if m > 0: 43 | if line.find("(") == -1: 44 | line = line[m:] 45 | if line.find("(") != -1: 46 | line = line[:line.find("(")] 47 | while line[0] == " ": line = line[1:] 48 | while line[-1] == " ": line = line[:-1] 49 | while line[:len(" ")] == " ": line = line[len(" "):] 50 | if len(line) > 1 and len(line) < 30: 51 | word_dict.setdefault(line, 0) 52 | 53 | for key in word_dict: 54 | f_out.write( "%s 100\n" %(key)) 55 | 56 | -------------------------------------------------------------------------------- /others/后台2/helper/mongoDB_process.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.3 20:30 first version 5 | 加载文件 6 | 
''' 7 | 8 | import csv 9 | import json 10 | import time 11 | import re 12 | from string import punctuation,digits,letters,whitespace 13 | import sys 14 | import os 15 | import datetime 16 | from bson import ObjectId 17 | import jieba 18 | import jieba.analyse 19 | import pandas as pd 20 | import pymongo 21 | from pymongo import MongoClient 22 | client=MongoClient() 23 | db = client.holmesdb 24 | 25 | 26 | 27 | def insertDB(mongodb_table, line, columns): 28 | try: 29 | ori = columns[0] 30 | columns[0] = "_id" 31 | data = {} 32 | for i in xrange(0, len(columns)): 33 | data.setdefault(columns[i], line[i]) 34 | mongodb_table.insert(data) 35 | columns[0] = ori 36 | except Exception : 37 | columns[0] = ori 38 | print Exception 39 | return 40 | 41 | -------------------------------------------------------------------------------- /others/后台2/helper/myio.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import csv 3 | import json 4 | import time 5 | import re 6 | from string import punctuation,digits,letters,whitespace 7 | import sys 8 | import datetime 9 | import jieba 10 | import jieba.analyse 11 | import math 12 | import pandas as pd 13 | from gensim import corpora,models 14 | from helper.textprocessing import handleContent 15 | from pymongo import MongoClient 16 | client=MongoClient() 17 | reload(sys) 18 | sys.setdefaultencoding('utf-8') 19 | 20 | 21 | def writeJsonDict(person, f_out): 22 | outStr = json.dumps(person, ensure_ascii = False) #处理完之后重新转为Json格式 23 | f_out.write(outStr.encode('utf-8') + '\n') #写回到一个新的Json文件中去 24 | 25 | def writeJsonDict(person, f_out, row_type=None): 26 | row_flag = 1 if row_type == "rows" else None 27 | outStr = json.dumps(person, ensure_ascii = False, indent=row_flag) #处理完之后重新转为Json格式 28 | f_out.write(outStr.encode('utf-8') + '\n') #写回到一个新的Json文件中去 29 | 30 | def getJsonFile_line(json_fname): 31 | json_file = file(json_fname, "r") 32 | json_vector = [] 33 | for line in json_file: 34 | person_info = json.loads(line) 35 | json_vector.append(person_info) 36 | return json_vector 37 | 38 | def getJsonFile_all(json_fname): 39 | json_file = open(json_fname, "r") 40 | dict = json.load(json_file) 41 | json_file.close() 42 | return dict 43 | -------------------------------------------------------------------------------- /others/后台2/helper/myio.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/后台2/helper/myio.pyc -------------------------------------------------------------------------------- /others/后台2/helper/nlp_model.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.13 16:30 first version 5 | 分词、提取关键字、提取每天热门关键字 6 | ''' 7 | 8 | import csv 9 | import json 10 | import time 11 | import re 12 | from string import punctuation,digits,letters,whitespace 13 | import sys 14 | import datetime 15 | 16 | import jieba 17 | import jieba.analyse 18 | import math 19 | import pandas as pd 20 | from gensim import corpora,models 21 | from helper.textprocessing import handleContent 22 | from pymongo import MongoClient 23 | reload(sys) 24 | sys.setdefaultencoding('utf-8') 25 | 26 | 27 | 28 | def handleContent(string): 29 | """字符串处理,去标点符号,中文分词,return:unicode""" 30 | string = string.decode('utf-8') 31 | #针对自己的文本数据定制化修改 32 | string = string.replace("
<br><br>", "").replace("<br/><br/>
", "").replace("", "").replace("", "") 33 | string = string.replace("#r#", "\r").replace("#n#", "\n").replace("#t#", "\t") 34 | string = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!:》,《”。“?、~@#¥%……&*()]+".decode("utf-8"), "".decode("utf-8"),string) 35 | string = string.encode('utf-8') 36 | string = string.translate(None,punctuation+whitespace) 37 | return string 38 | 39 | def getKeyword_times(pd_docs, time_begin, time_end): 40 | time_begin = datetime.datetime.strftime(time_begin, '%Y-%m-%d') 41 | time_end = datetime.datetime.strftime(time_end, '%Y-%m-%d') 42 | docs = pd_docs[pd_docs.item_pub_time >= time_begin] 43 | docs = docs[docs.item_pub_time <= time_end] 44 | keyword_dict = {} 45 | for i in xrange(0, len(docs)): 46 | title = docs.iloc[i]["title"] 47 | title = handleContent(title) 48 | title_keyword = list(jieba.cut(title, cut_all=False)) 49 | content = docs.iloc[i]["content"] 50 | content = handleContent(content) 51 | cont_keyword = jieba.analyse.extract_tags(content, topK = 100) 52 | for kw in title_keyword: 53 | if kw.isdigit() == True or len(kw) <= 1: 54 | continue 55 | keyword_dict[kw] = keyword_dict.setdefault(kw, 0) + 2 56 | for kw in cont_keyword: 57 | if kw.isdigit() == True or len(kw) <= 1: 58 | continue 59 | keyword_dict[kw] = keyword_dict.setdefault(kw, 0) + 1 60 | return sorted(keyword_dict.items(), lambda a,b:-cmp(a[1], b[1])) -------------------------------------------------------------------------------- /others/后台2/helper/nlp_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/后台2/helper/nlp_model.pyc -------------------------------------------------------------------------------- /others/后台2/helper/sentiments_analyze.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/后台2/helper/sentiments_analyze.pyc -------------------------------------------------------------------------------- /others/后台2/helper/textprocessing.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/后台2/helper/textprocessing.pyc -------------------------------------------------------------------------------- /others/后台2/industry_analyze.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.6 18:26 first version 5 | 行业的所有数据汇总,并存入数据库holmesdb 6 | ''' 7 | 8 | import csv 9 | import json 10 | import time 11 | import re 12 | from string import punctuation,digits,letters,whitespace 13 | import sys 14 | import datetime 15 | 16 | import jieba 17 | import jieba.analyse 18 | import pandas as pd 19 | from gensim import corpora,models 20 | from helper.textprocessing import handleContent 21 | from pymongo import MongoClient 22 | client=MongoClient() 23 | reload(sys) 24 | sys.setdefaultencoding('utf-8') 25 | jieba.load_userdict("C:/Python27/Lib/site-packages/jieba-0.37-py2.7.egg/jieba/financedict.txt") 26 | 27 | 28 | 29 | area_data = pd.read_csv("./data/platform_company/industry_areas.csv") 30 | db = client.holmesdb 31 | t_sh_industry_areas = db.t_sh_industry_areas 32 | for i in xrange(len(area_data)): 33 | kw_data = {} 34 | for col in area_data.columns: 35 | kw_data.setdefault(col, area_data.iloc[i][col]) 36 | if 
t_sh_industry_areas.find_one(kw_data) == None: 37 | id = t_sh_industry_areas.insert_one(kw_data).inserted_id 38 | 39 | class_data = pd.read_csv("./data/platform_company/industry_class.csv") 40 | db = client.holmesdb 41 | t_sh_industry_class = db.t_sh_industry_class 42 | for i in xrange(len(class_data)): 43 | kw_data = {} 44 | for col in class_data.columns: 45 | kw_data.setdefault(col, class_data.iloc[i][col]) 46 | if t_sh_industry_class.find_one(kw_data) == None: 47 | id = t_sh_industry_class.insert_one(kw_data).inserted_id 48 | 49 | interest_data = pd.read_csv("./data/platform_company/industry_interest.csv") 50 | db = client.holmesdb 51 | t_sh_industry_interest = db.t_sh_industry_interest 52 | for i in xrange(len(interest_data)): 53 | kw_data = {} 54 | for col in interest_data.columns: 55 | kw_data.setdefault(col, interest_data.iloc[i][col]) 56 | if t_sh_industry_interest.find_one(kw_data) == None: 57 | id = t_sh_industry_interest.insert_one(kw_data).inserted_id -------------------------------------------------------------------------------- /others/后台2/knowledge_graph_build.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.4.17 01:08 first version 5 | 构建知识图谱,pipeline 6 | 1、词性标注 7 | 2、歧义消除 8 | 3、关系抽取 9 | 4、知识推理 10 | 5、知识表示 11 | ''' 12 | 13 | import csv 14 | import json 15 | import time 16 | import re 17 | from string import punctuation,digits,letters,whitespace 18 | import sys 19 | import datetime 20 | from helper import myio 21 | import jieba 22 | import jieba.analyse 23 | import jieba.posseg as pseg 24 | import math 25 | import pandas as pd 26 | from gensim import corpora,models 27 | import helper.textprocessing as tp 28 | from helper.textprocessing import handleContent 29 | from pymongo import MongoClient 30 | client=MongoClient() 31 | reload(sys) 32 | sys.setdefaultencoding('utf-8') 33 | 34 | knowledge_graph_dir = "./data/knowledge_graph/" 35 | def getLastNameDict(): 36 | last_name_dict ={} 37 | name_vec = [line.strip().split(" ") for line in open(knowledge_graph_dir + u"中国姓.txt")] 38 | for vec in name_vec: 39 | if len(vec) > 1: 40 | for v in vec: 41 | last_name_dict.setdefault(v, 0) 42 | return last_name_dict 43 | 44 | def extractEntity(): 45 | db = client.holmesdb 46 | t_news = db.t_news_di 47 | res_list = t_news.find() 48 | last_name_dict = getLastNameDict() 49 | 50 | ntoken_dict = {} 51 | institute_dict = {} 52 | location_dict = {} 53 | people_dict = {} 54 | 55 | row_cnt = 0 56 | for res in res_list: 57 | row_cnt += 1 58 | # if row_cnt >= 2000: break 59 | title = res["title"] 60 | content = res["content"] 61 | doc = myio.handleContent(title) + " " + myio.handleContent(content) 62 | words = pseg.cut(doc) 63 | for (word, flag) in words: 64 | if flag.find("n") != -1: 65 | if len(word) == 1: 66 | continue 67 | word1 = word[0].encode("utf-8") 68 | word2 = word[:2].encode("utf-8") 69 | if word1 in last_name_dict or word2 in last_name_dict: 70 | #print word[0], word[:2] 71 | people_dict[word] = people_dict.setdefault(word, 0) + 1 72 | if flag.find("t") != -1 or flag.find("r") != -1: 73 | institute_dict[word] = institute_dict.setdefault(word, 0) + 1 74 | if flag.find("s") != -1: 75 | location_dict[word] = location_dict.setdefault(word, 0) + 1 76 | #print w.word, w.flag 77 | ntoken_dict[word] = ntoken_dict.setdefault(word, 0) + 1 78 | ntoken_list = sorted(ntoken_dict.items(), lambda a, b: -cmp(a[1], b[1])) 79 | people_list = sorted(people_dict.items(), lambda a, b: -cmp(a[1], b[1])) 80 | 
institute_list = sorted(institute_dict.items(), lambda a, b: -cmp(a[1], b[1])) 81 | location_list = sorted(location_dict.items(), lambda a, b: -cmp(a[1], b[1])) 82 | 83 | f_ntoken = open(knowledge_graph_dir + "news_ntoken.txt", "w") 84 | f_peo = open(knowledge_graph_dir + "news_people.txt", "w") 85 | f_ins = open(knowledge_graph_dir + "news_institute.txt", "w") 86 | f_loc = open(knowledge_graph_dir + "news_location.txt", "w") 87 | for (word, freq) in ntoken_list: 88 | print word, freq 89 | f_ntoken.write("%s\n"%word) 90 | for (word, freq) in institute_list: 91 | print word, freq 92 | f_ins.write("%s\n"%word) 93 | for (word, freq) in location_list: 94 | print word, freq 95 | f_loc.write("%s\n"%word) 96 | for (word, freq) in people_list: 97 | print word, freq 98 | f_peo.write("%s\n"%word) 99 | 100 | def extractRelation(): 101 | db = client.holmesdb 102 | t_news = db.t_news_di 103 | res_list = t_news.find() 104 | 105 | pair3_dict = {} 106 | pair2_dict = {} 107 | row_cnt = 0 108 | for res in res_list: 109 | row_cnt += 1 110 | if row_cnt >= 20000: break 111 | title = res["title"] 112 | if title.find(u"要不要打破刚性兑付?") != -1: 113 | continue 114 | content = res["content"] 115 | content_sen = tp.cut_sentence_2(content) 116 | sentence_list = [title] + content_sen 117 | for sen in sentence_list: 118 | sen = myio.handleContent(sen) 119 | if len(sen) < 5: continue 120 | if sen.find(u"尹许尹") != -1: 121 | print title 122 | print content 123 | print sen 124 | words = pseg.cut(sen) 125 | ntoken_list = [] 126 | for (word, flag) in words: 127 | if flag.find("n") != -1 and (flag.find("r") != -1 or flag.find("s") != -1 or flag.find("t") != -1): 128 | ntoken_list.append(word) 129 | for i in xrange(len(ntoken_list) - 1): 130 | for j in xrange(i+1, len(ntoken_list)): 131 | if ntoken_list[i] == ntoken_list[j]: 132 | continue 133 | pair2 = (ntoken_list[i], ntoken_list[j]) 134 | pair2_dict[pair2] = pair2_dict.setdefault(pair2, 0) + 1 135 | for k in xrange(j+1, len(ntoken_list)): 136 | if ntoken_list[i] == ntoken_list[k] or ntoken_list[j] == ntoken_list[k]: 137 | continue 138 | pair3 = (ntoken_list[i], ntoken_list[j], ntoken_list[k]) 139 | pair3_dict[pair3] = pair3_dict.setdefault(pair3, 0) + 1 140 | pair2_list = sorted(pair2_dict.items(), lambda a,b: -cmp(a[1], b[1])) 141 | # for (w1, w2) in pair2_list: 142 | # print w1[0], w1[1], w2 143 | pair3_list = sorted(pair3_dict.items(), lambda a,b: -cmp(a[1], b[1])) 144 | # for (w1, w2) in pair3_list: 145 | # print w1[0], w1[1], w1[2], w2 146 | f_rel = open(knowledge_graph_dir + "news_relation.txt", "w") 147 | for (w1, w2) in pair2_list[:500000]: 148 | f_rel.write("%s %s\n"%(w1[0], w1[1])) 149 | for (w1, w2) in pair3_list[:3000000]: 150 | f_rel.write("%s %s %s\n"%(w1[0], w1[1], w1[2])) 151 | 152 | if __name__ == "__main__": 153 | # pipeline step1 154 | # extractEntity() 155 | # pipeline step3 156 | extractRelation() -------------------------------------------------------------------------------- /others/后台2/mongoDB_Test.py: -------------------------------------------------------------------------------- 1 | #encoder=utf8 2 | 3 | 4 | from pymongo import MongoClient 5 | 6 | client=MongoClient() 7 | 8 | 9 | db = client.holmesdb 10 | 11 | data = db.t_sh_industry_keywords.find() 12 | for d in data: 13 | for k in d: 14 | if k.find("hot") != -1: 15 | print d["dt"], k, d[k] 16 | 17 | -------------------------------------------------------------------------------- /others/后台2/platform_article_keys_extract.py: 
-------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.3 20:30 first version 5 | 加载文件 6 | ''' 7 | import csv 8 | import json 9 | import os 10 | import time 11 | import re 12 | from string import punctuation,digits,letters,whitespace 13 | import sys 14 | import datetime 15 | from helper import myio 16 | import jieba 17 | import jieba.analyse 18 | import math 19 | from helper import myio 20 | import pandas as pd 21 | from gensim import corpora,models 22 | from helper.textprocessing import handleContent 23 | from pymongo import MongoClient 24 | client=MongoClient() 25 | db = client.holmesdb 26 | reload(sys) 27 | sys.setdefaultencoding('utf-8') 28 | stop_dict = {} 29 | for line in open("C:\Python27\Lib\site-packages\jieba-0.37-py2.7.egg\jieba\stop_chinese.txt"): 30 | stop_dict.setdefault(line.strip(), 0) 31 | 32 | 33 | platform_json = myio.getJsonFile_all('./data/platform_company/display_platform.json') 34 | platform_name_list = platform_json.keys() 35 | 36 | 37 | print "before load date", datetime.datetime.now() 38 | news_dataset = pd.read_pickle("./data/news_dataset.pkl") 39 | all_dataset = news_dataset 40 | all_doc_cnt = len(news_dataset) 41 | print "end load date", datetime.datetime.now() 42 | 43 | 44 | print "before cut segments", datetime.datetime.now() 45 | # 分词,关键字提取 46 | df_dict = {} 47 | for i in xrange(0, all_doc_cnt):#len(all_dataset)): 48 | title = all_dataset.iloc[i]['title'] 49 | content = all_dataset.iloc[i]['content'] 50 | item_pub_time = all_dataset.iloc[i]['item_pub_time'] 51 | doc = handleContent(title) + " " + handleContent(content) 52 | tokens = list(jieba.cut(doc)) 53 | new_tokens = [] 54 | 55 | token_dict_delta = {} 56 | for i in xrange( len(tokens) ): 57 | if tokens[i].isdigit() == True or len(tokens[i]) <= 1\ 58 | or (tokens[i].isalnum() == True and len(tokens[i]) > 20): 59 | #print tokens[i], 60 | continue 61 | if tokens[i] in stop_dict:#去停用词 62 | continue 63 | new_tokens.append(tokens[i]) 64 | token_dict_delta.setdefault(tokens[i], 0) 65 | for token in token_dict_delta: 66 | df_dict[token] = df_dict.setdefault(token, 0) + 1 67 | print "all word cnt:" , len(df_dict) 68 | t_word_df_dd = db.t_word_df_dd 69 | t_word_df_dd.remove() 70 | for token in df_dict: 71 | t_word_df_dd.insert({"word":token, "df":df_dict[token]}) 72 | 73 | t_word_df_dd = db.t_word_df_dd 74 | t_news = db.t_news_di 75 | news_res = t_news.find({"item_pub_time": {"$lt": '2016-01-05', "$gt": '2015-11-25'}}) 76 | print "month news cnt:", news_res.count() 77 | platform_key_dict = {} 78 | for news in news_res: 79 | title = news['title'] 80 | content = news['content'] 81 | doc = handleContent(title) + " " + handleContent(content) 82 | tokens = list(jieba.cut(doc)) 83 | token_dict = {} 84 | for i in xrange( len(tokens) ): 85 | if tokens[i].isdigit() == True or len(tokens[i]) <= 1\ 86 | or (tokens[i].isalnum() == True and len(tokens[i]) > 20): 87 | #print tokens[i], 88 | continue 89 | if tokens[i] in stop_dict:#去停用词 90 | continue 91 | token_dict[tokens[i]] = token_dict.setdefault(tokens[i], 0) + 1 92 | token_w_list = [] 93 | for token in token_dict: 94 | tf = token_dict[token] 95 | # df = t_word_df_dd.find_one({"word":token}) 96 | # df = df["df"] if df != None else 0 97 | df = df_dict[token] if token in df_dict else 0 98 | tfidf = math.log(1+tf) * math.log((1+all_doc_cnt) * 1.0 / (1+df)) 99 | token_w_list.append((token, tfidf)) 100 | news_key_list = sorted(token_w_list, lambda a,b: -cmp(a[1], b[1]))[:20] 101 | 
p_cnt = 0 102 | for pname in platform_name_list: 103 | if doc.find(pname) != -1: 104 | p_cnt += 1 105 | for pname in platform_name_list: 106 | if title.find(pname) != -1: 107 | for (key, w) in news_key_list: 108 | if key != pname: 109 | #print pname, key, w 110 | platform_key_dict[pname][key] = platform_key_dict.setdefault(pname, {}).setdefault(key, 0) + w * 1.0 / p_cnt 111 | 112 | platform_key_month12 = {} 113 | f_path = "./data/platform_company/platform_news_keywords" 114 | for pname in platform_name_list: 115 | if pname in platform_key_dict: 116 | hot_key_list = sorted(platform_key_dict[pname].items(), lambda a,b: -cmp(a[1], b[1]))[:50] 117 | print pname 118 | for (hot_key, w) in hot_key_list: 119 | print hot_key, 120 | print "" 121 | platform_key_month12.setdefault(pname, hot_key_list) 122 | myio.writeJsonDict(platform_key_month12, open(f_path, "w"), "rows") 123 | 124 | -------------------------------------------------------------------------------- /others/后台2/platform_data_loads.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.3 20:30 first version 5 | 加载文件 6 | ''' 7 | 8 | import csv 9 | import json 10 | import time 11 | import re 12 | from string import punctuation,digits,letters,whitespace 13 | import sys 14 | import os 15 | import datetime 16 | import jieba 17 | import jieba.analyse 18 | import pandas as pd 19 | from pymongo import MongoClient 20 | client=MongoClient() 21 | reload(sys) 22 | sys.setdefaultencoding('utf-8') 23 | csv.field_size_limit(sys.maxint) 24 | 25 | 26 | 27 | def handleContent(string): 28 | """字符串处理,去标点符号,中文分词,return:unicode""" 29 | string = string.decode('utf-8') 30 | #针对自己的文本数据定制化修改 31 | string = string.replace("
", "").replace("
", "").replace("", "").replace("", "") 32 | string = string.replace("#r#", "\n").replace("#n#", "\n").replace("", "") 33 | string = string.replace(" ", "").replace("\n", "").replace("\t", " ") 34 | 35 | string = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!:》,《”。“?、~@#¥%……&*()]+".decode("utf-8"), "".decode("utf-8"),string) 36 | string = string.encode('utf-8') 37 | string = string.translate(None,punctuation+digits+letters+whitespace) 38 | return string 39 | 40 | def getJsonFile_line(json_fname): 41 | json_file = file(json_fname, "r") 42 | json_vector = [] 43 | for line in json_file: 44 | person_info = json.loads(line) 45 | json_vector.append(person_info) 46 | return json_vector 47 | def getJsonFile_all(json_fname): 48 | json_file = open(json_fname, "r") 49 | dict = json.load(json_file) 50 | json_file.close() 51 | return dict 52 | 53 | 54 | 55 | 56 | db = client.holmesdb 57 | t_company_info = db.t_company_info 58 | t_company_info.remove() 59 | t_bad_company_info = db.t_bad_company_info 60 | t_bad_company_info.remove() 61 | print "before load all company dadtas", datetime.datetime.now() 62 | def getCompanyList(): 63 | company_dict = {} 64 | company_f_in = csv.reader(file(u"data/platform_company/网贷之家.csv", "r")) 65 | lines = [line for line in company_f_in] 66 | for line in lines[1:]: 67 | company_dict.setdefault(line[1], 0) 68 | company_f_in = csv.reader(file(u"data/platform_company/融360.csv", "r")) 69 | lines = [line for line in company_f_in] 70 | for line in lines[1:]: 71 | company_dict.setdefault(line[0], 0) 72 | company_f_in = csv.reader(file(u"data/platform_company/百度财富.csv", "r")) 73 | lines = [line for line in company_f_in] 74 | for line in lines[1:]: 75 | company_dict.setdefault(line[1], 0) 76 | company_f_out = open("data/platform_company/company_list.txt", "w") 77 | for c in company_dict: 78 | company_f_out.write(c + "\n") 79 | 80 | all_data_dict = getJsonFile_all('./data/platform_company/wangdai_platform.json') 81 | 82 | #company_f_out = open(r"C:\Python27\Lib\site-packages\jieba-0.37-py2.7.egg\jieba/company_dict.txt", "w") 83 | company_f_out = open(r"./data/platform_company/company_list.txt", "w") 84 | for c in company_dict: 85 | #company_f_out.write(c + " 10000 n\n") 86 | company_f_out.write(c + "\n") 87 | return company_dict, all_data_dict 88 | 89 | def getBadCompanyList(): 90 | bad_company_dict = {} 91 | all_data_dict = getJsonFile_all('./data/bad_platform/problem_platform.json') 92 | return all_data_dict 93 | 94 | company_dict, company_info_list = getCompanyList() 95 | company_info_key = company_info_list[0].keys() 96 | bad_company_info_list = getBadCompanyList() 97 | for plat in company_info_list: 98 | t_company_info.insert(plat) 99 | if t_company_info.find_one(plat['_id']) == None: 100 | plat['_id'] = plat['platName'] 101 | bad_company_2015 = [] 102 | for plat in bad_company_info_list: 103 | for company in company_info_list: 104 | if plat['platform_name'] == company['platName']: 105 | for key in company: 106 | if key != 'platName': 107 | plat.setdefault(key, company[key]) 108 | 109 | plat['_id'] = plat['platform_name'] 110 | if "online_time" in plat : 111 | if plat["online_time"].strip().find("年") != -1: 112 | plat["online_time"] = plat["online_time"].strip()[:4] + ".01" 113 | if "problem_time" in plat: 114 | if plat["problem_time"].strip().find("年") != -1: 115 | plat["problem_time"] = plat["problem_time"].strip()[:4] + ".01" 116 | if t_bad_company_info.find_one(plat['_id']) == None: 117 | t_bad_company_info.insert(plat) 118 | if plat['problem_time'][:4] == '2015': 119 | 
bad_company_2015.append(plat) 120 | def writeJsonDict(person, f_out): 121 | outStr = json.dumps(person, ensure_ascii = False, indent=1) #处理完之后重新转为Json格式 122 | f_out.write(outStr.encode('utf-8') + '\n') #写回到一个新的Json文件中去 123 | writeJsonDict(bad_company_2015, open("./data/bad_platform/bad_platform_2015.json", "w")) 124 | print "end loads all company datas", datetime.datetime.now() 125 | 126 | 127 | -------------------------------------------------------------------------------- /others/后台2/process_analyze.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.2.27 20:30 first version 5 | 分词、提取关键字、提取文章主题 6 | ''' 7 | 8 | import csv 9 | import json 10 | import time 11 | import re 12 | from string import punctuation,digits,letters,whitespace 13 | import sys 14 | import datetime 15 | 16 | import jieba 17 | import jieba.analyse 18 | import math 19 | import pandas as pd 20 | from gensim import corpora,models 21 | from helper.textprocessing import handleContent 22 | from pymongo import MongoClient 23 | client=MongoClient() 24 | reload(sys) 25 | sys.setdefaultencoding('utf-8') 26 | 27 | 28 | 29 | 30 | 31 | print "before load date", datetime.datetime.now() 32 | news_dataset = pd.read_pickle("./data/news_dataset.pkl") 33 | news_dataset_other = pd.read_pickle("./data/news_dataset_other.pkl") 34 | all_dataset = pd.concat([news_dataset, news_dataset_other]) 35 | print "end load date", datetime.datetime.now() 36 | 37 | 38 | # 分词,关键字提取 39 | text_tags = [] 40 | lda_train_set = [] 41 | keyword_dict = {} 42 | day_cnt = {} 43 | 44 | 45 | for item_pub_time in all_dataset['item_pub_time']: 46 | day_cnt[item_pub_time[:10]] = day_cnt.setdefault(item_pub_time[:10], 0) + 1 47 | day_cnt = sorted(day_cnt.items(), lambda a, b: -cmp(a[0], b[0])) 48 | f_out = open("data/everyday_newscnt.txt", "w") 49 | for pp in day_cnt: 50 | if len(pp[0]) >= 1 and str(pp[0][0]).isdigit() == True: 51 | f_out.write("%s %s\n"%(pp[0], pp[1])) 52 | 53 | print "before cut segments", time.localtime() 54 | for row_id, news in all_dataset.iterrows(): 55 | if row_id % 1000 == 999: 56 | print row_id 57 | content = news['content'] 58 | content = handleContent(content) 59 | pub_time = news['item_pub_time'] 60 | content = content.replace(" ", "").replace("\n", "").replace(" ", "") 61 | seg = list(jieba.cut(content)) 62 | 63 | lda_train_set.append(seg) 64 | key_words = jieba.analyse.extract_tags(content, topK = 20) 65 | for token in key_words: 66 | keyword_dict[token] = keyword_dict.setdefault(token, 0) + 1 67 | text_tags.append(key_words) 68 | 69 | 70 | 71 | print "end cut segments", time.localtime() 72 | 73 | 74 | for keyword in key_words: 75 | print keyword 76 | 77 | -------------------------------------------------------------------------------- /others/后台2/sparser/hexun/ReadMe.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/后台2/sparser/hexun/ReadMe.txt -------------------------------------------------------------------------------- /others/后台2/sparser/p2pguancha_sparser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Feb 27 21:25:37 2016 4 | 5 | @author: yue 6 | """ 7 | 8 | import requests 9 | import json 10 | import time 11 | 12 | idd = '1991' 13 | while(True): 14 | url = 
"http://www.p2pguancha.com/api.php?action=categorycontent&cid=11&id="+idd+"&num=10" 15 | r = requests.get(url) 16 | result = r.json() 17 | for i in range(0,len(result['article'])): 18 | print result['article'][i].keys() 19 | save = {} 20 | save['source '] = "P2P观察网" 21 | save['item_id'] = "p2pgc_" + result['article'][i]['id'].encode('utf8') 22 | save['item_type'] = "news" 23 | save['author'] = "" if "author_id" in result['article'][i] else result['article'][i]['author_id'].encode('utf8') 24 | save['tags'] = result['article'][i]['tag_name'].encode('utf8') 25 | save['title'] = result['article'][i]['title'].encode('utf8') 26 | save['content'] = result['article'][i]['content'].encode('utf8') 27 | save['url'] = "http://www.p2pguancha.com/article/"+save['item_id']+".html" 28 | save['source_name'] = result['article'][i]['source_name'].encode('utf8') 29 | save['news_pub_time'] = result['article'][i]['release_time'].encode('utf8') 30 | save['gmt_create'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 31 | save_str = json.dumps(save,ensure_ascii=False) 32 | f = open("p2pguancha_news.txt",'a') 33 | f.write(save_str+'\n') 34 | f.close() 35 | idd = save['item_id'] 36 | print idd -------------------------------------------------------------------------------- /others/后台2/spider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Administrator' 2 | -------------------------------------------------------------------------------- /others/后台2/spider/extract_p2p_news.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.4.10 20:30 first version 5 | 提取三大门户新闻网+财新网中的P2P资讯 6 | ''' 7 | 8 | import csv 9 | import json 10 | import time 11 | import re 12 | from string import punctuation,digits,letters,whitespace 13 | import sys 14 | import os 15 | import datetime 16 | from bson import ObjectId 17 | import jieba 18 | import jieba.analyse 19 | import pandas as pd 20 | import pymongo 21 | from pymongo import MongoClient 22 | client=MongoClient() 23 | reload(sys) 24 | sys.setdefaultencoding('utf-8') 25 | csv.field_size_limit(sys.maxint) 26 | 27 | 28 | p2p_key_list = [line.strip() for line in open("../data/key_list_hot.txt")] 29 | 30 | columns = "item_id,item_type,source,url,author,title,content,item_pub_time,tags,cmt_cnt,fav_cnt,gmt_create,exinfo1,exinfo2".split(',') 31 | item_id_dict = {} 32 | writer = csv.writer(file("../data/news/news_other.csv", 'wb')) 33 | writer.writerow(columns) 34 | 35 | news_cnt = 0 36 | news_other_dir = "../data/news/news_other" 37 | date_dict = {} 38 | month_dict = {} 39 | for cur,dirnames,filenames in os.walk(news_other_dir): #三个参数:分别返回1.父目录 2.所有文件夹名字(不含路径) 3.所有文件名字 40 | for f in os.listdir(cur): 41 | print f 42 | # if f.find("weixin") == -1: 43 | # continue 44 | news_cnt = 0 45 | f_in = csv.reader(file(news_other_dir + r"/" + f, "r")) 46 | lines = [line for line in f_in] 47 | print len(lines) 48 | for line in lines[1:]: 49 | if len(line) < 14: 50 | continue 51 | if len(line) > 14: 52 | line = line[:14] 53 | #content = line[6].replace("###r###", "\r").replace("###n###", "\n").replace("###t###", "\t") 54 | content = line[6] 55 | title = line[5] 56 | 57 | for key in p2p_key_list: 58 | if title.find(key) != -1 or content.find(key) != -1: 59 | #print title 60 | #print content 61 | writer.writerow(line) 62 | #if f == 'caixin.csv': 63 | #print key 64 | #print title, content 65 | news_cnt += 1 66 | dt = line[7] 67 | m = dt[:8] 68 | 
date_dict[dt] = date_dict.setdefault(dt, 0) + 1 69 | month_dict[m] = month_dict.setdefault(m, 0) + 1 70 | break 71 | print news_cnt 72 | print date_dict 73 | print month_dict -------------------------------------------------------------------------------- /others/后台2/spider/hujin_institute_process.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.4.16 19:50 first version 5 | 处理互联网金融协会名单 6 | ''' 7 | import csv 8 | import os 9 | import sys 10 | import bs4 11 | import datetime 12 | import requests, html2text 13 | reload(sys) 14 | sys.setdefaultencoding('utf-8') 15 | 16 | f_in = open(u"../data/knowledge_graph/中国互联网金融协会会员名单.txt") 17 | f_out = open(u"../data/knowledge_graph/中国互联网金融协会会员名单_分开.txt", "w") 18 | f_relation_out = open(r"../data/knowledge_graph/relation_equal.txt", "w") 19 | 20 | for line in f_in: 21 | print line.strip() 22 | if line.find( u")") == -1: 23 | f_out.write(line) 24 | else: 25 | line_rep = line.strip().replace(u")", "#").replace(u"(", "$") 26 | print "\t", line_rep 27 | if line_rep[-1] == "#": 28 | rev_index = line_rep.rfind("$") 29 | rev_end_index = line_rep.rfind("#") 30 | pname1 = line_rep[:rev_index] 31 | pname2 = line_rep[rev_index+1:-1] 32 | print "\t", pname1, pname2 33 | pname1 = pname1.replace("#", u")").replace("$", u"(") 34 | f_out.write(pname1 + "\n") 35 | f_out.write(pname2 + "\n") 36 | f_relation_out.write("%s,%s"%(pname1, pname2)) 37 | f_relation_out.write("%s,%s"%(pname2, pname1)) 38 | else: 39 | f_out.write(line) 40 | 41 | -------------------------------------------------------------------------------- /others/后台2/spider/ifeng_extractNews.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.13 19:50 first version 5 | 从ifeng网的html页面里提取结构化新闻数据 6 | ''' 7 | import csv 8 | import os 9 | import sys 10 | import bs4 11 | import datetime 12 | import requests, html2text 13 | try: 14 | from bs4 import BeautifulSoup 15 | except: 16 | import BeautifulSoup 17 | reload(sys) 18 | sys.setdefaultencoding('utf-8') 19 | ifeng_dir = r"D:\LoalaSave\news.ifeng.com" 20 | 21 | 22 | date_dict = {} 23 | columns = "item_id,item_type,source,url,author,title,content,item_pub_time,tags,cmt_cnt,fav_cnt,gmt_create,exinfo1,exinfo2".split(',') 24 | item_id_dict = {} 25 | writer = csv.writer(file("../data/news/news_other/ifeng.csv", 'wb')) 26 | writer.writerow(columns) 27 | def extract_news(soup, news_cnt): 28 | try: 29 | metas = soup.find_all("meta") 30 | #print metas 31 | key_words = "" 32 | is_article = 0 33 | title = "" 34 | url = "" 35 | description = "" 36 | image_url = "" 37 | 38 | for meta in metas: 39 | if meta.has_attr("name") == True: 40 | if meta["name"] == "keywords": 41 | key_words = meta["content"] 42 | if meta["name"] == "og:time": 43 | #print meta["content"] 44 | item_pub_time = meta["content"].replace("年", "-").replace("月", "-").replace("日", "").split(" ")[0] 45 | # print item_pub_time[:10] 46 | date_dict[item_pub_time[:10]] = date_dict.setdefault(item_pub_time[:10], 0) +1 47 | if meta["content"] == "news": 48 | is_article = 1 49 | if meta.has_attr("property") == True: 50 | if meta["property"] == "og:title": 51 | title = meta["content"] 52 | if meta["property"] == "og:url": 53 | url = meta["content"] 54 | if meta["property"] == "og:description": 55 | description = meta["content"] 56 | # print is_article 57 | # print item_pub_time 58 | # print title 59 | # print key_words 
60 | # print description 61 | # print url 62 | # print "" 63 | if is_article == 0: 64 | return -1 65 | item_id = "ifeng-" + str(news_cnt) 66 | content = "" 67 | content_div = soup.find(id="main_content") 68 | #print content_div 69 | p_list = content_div.find_all("p") 70 | #print p_list 71 | #print p_list 72 | for i in xrange(len(p_list)): 73 | p = "" 74 | for e in p_list[i].contents: 75 | try: 76 | p += e.string 77 | except Exception: 78 | continue 79 | content += p + "\n" 80 | #print content 81 | content = content.replace("\n", "###n###") 82 | content = content.replace("\r", "###r###") 83 | gmt_create = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %M:%S") 84 | cmt_cnt = 0 85 | fav_cnt = 0 86 | source = u"凤凰网" 87 | 88 | 89 | # print item_id 90 | item_type = "news" 91 | tags = key_words.replace(" ", ",") 92 | #print tags 93 | # print gmt_create 94 | # print content 95 | # print item_pub_time 96 | exinfo1 = "" 97 | exinfo2 = "" 98 | if image_url != "": 99 | exinfo2 = "image_url:" + image_url 100 | 101 | result = {} 102 | result['url'] = url 103 | result['item_id'] = item_id 104 | result['item_type'] = item_type 105 | result['author'] = 'ifeng_jizhe' 106 | result['source'] = source 107 | result['title'] = title 108 | result['content'] = content 109 | result['item_pub_time'] = item_pub_time 110 | result['tags'] = tags 111 | result['cmt_cnt'] = cmt_cnt 112 | result['fav_cnt'] = fav_cnt 113 | result['exinfo1'] = exinfo1 114 | result['exinfo2'] = exinfo2 115 | result['gmt_create'] = gmt_create 116 | 117 | line = [] 118 | for col in columns: 119 | if col not in result: 120 | line.append('') 121 | else: 122 | line.append(str(result[col]).encode('utf-8')) 123 | writer.writerow(line) 124 | except Exception, e: 125 | return -1 126 | return 0 127 | 128 | 129 | news_cnt = 0 130 | for cur,dirnames,filenames in os.walk(ifeng_dir): #三个参数:分别返回1.父目录 2.所有文件夹名字(不含路径) 3.所有文件名字 131 | for f in os.listdir(cur): 132 | print "#", f 133 | # f = '1048' 134 | try: 135 | f_path = os.path.join(cur, f) 136 | soup = BeautifulSoup(open(f_path)) 137 | if soup == None or soup.find("title") == None: 138 | continue 139 | title = soup.find("title").string 140 | flag = extract_news(soup, news_cnt) 141 | if flag == 0: 142 | news_cnt += 1 143 | print news_cnt 144 | if news_cnt % 1000 == 1: 145 | print news_cnt 146 | except Exception, e: 147 | print e 148 | continue 149 | 150 | #break 151 | print news_cnt 152 | print len(item_id_dict) -------------------------------------------------------------------------------- /others/后台2/spider/jpm_extractNews.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.13 19:50 first version 5 | 从caixin网的html页面里提取结构化新闻数据 6 | ''' 7 | import csv 8 | import os 9 | import sys 10 | import bs4 11 | import datetime 12 | import requests, html2text 13 | try: 14 | from bs4 import BeautifulSoup 15 | except: 16 | import BeautifulSoup 17 | reload(sys) 18 | sys.setdefaultencoding('utf-8') 19 | zhongshen_dir = r"C:\Users\Administrator\Desktop\Working Folder\Holmes\data\news\jpm" 20 | 21 | 22 | 23 | columns = "item_id,item_type,source,url,author,title,content,item_pub_time,tags,cmt_cnt,fav_cnt,gmt_create,exinfo1,exinfo2".split(',') 24 | item_id_dict = {} 25 | writer = csv.writer(file("../data/news/jpm.csv", 'wb')) 26 | writer.writerow(columns) 27 | def extract_news(soup, news_cnt): 28 | try: 29 | infomain = soup.find(class_="content2") 30 | #print infomain 31 | key_words = "" 32 | is_article = 0 33 
| title = infomain.find("h1").string.strip() 34 | #print title 35 | url = "" 36 | description = "" 37 | image_url = "" 38 | info1 = infomain.find(class_="writer") 39 | item_pub_time = info1.find_all("span")[3].string.split(" ")[0] 40 | #print item_pub_time 41 | p_list = infomain.find_all("p") 42 | #print p_list 43 | content = "" 44 | for i in xrange(len(p_list)): 45 | p = "" 46 | for e in p_list[i].contents: 47 | try: 48 | p += e.string 49 | except Exception: 50 | continue 51 | content += p + "\n" 52 | content = content.replace("\n", "###n###") 53 | content = content.replace("\r", "###r###") 54 | # print content 55 | item_id = "jpm-" + str(news_cnt) 56 | # print item_id 57 | if item_id not in item_id_dict: 58 | item_id_dict.setdefault(item_id, 0) 59 | else: 60 | return 61 | 62 | gmt_create = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %M:%S") 63 | cmt_cnt = 0 64 | fav_cnt = 0 65 | source = u"金评媒" 66 | 67 | 68 | # print item_id 69 | item_type = "news" 70 | tags = key_words.replace(" ", ",") 71 | exinfo1 = "" 72 | exinfo2 = "" 73 | if image_url != "": 74 | exinfo2 = "image_url:" + image_url 75 | 76 | result = {} 77 | result['url'] = url 78 | result['item_id'] = item_id 79 | result['item_type'] = item_type 80 | result['author'] = 'jpm_jizhe' 81 | result['source'] = source 82 | result['title'] = title 83 | result['content'] = content 84 | result['item_pub_time'] = item_pub_time 85 | result['tags'] = tags 86 | result['cmt_cnt'] = cmt_cnt 87 | result['fav_cnt'] = fav_cnt 88 | result['exinfo1'] = exinfo1 89 | result['exinfo2'] = exinfo2 90 | result['gmt_create'] = gmt_create 91 | 92 | line = [] 93 | for col in columns: 94 | if col not in result: 95 | line.append('') 96 | else: 97 | line.append(str(result[col]).encode('utf-8')) 98 | writer.writerow(line) 99 | except Exception, e: 100 | return -1 101 | return 0 102 | 103 | 104 | news_cnt = 0 105 | for cur,dirnames,filenames in os.walk(zhongshen_dir): #三个参数:分别返回1.父目录 2.所有文件夹名字(不含路径) 3.所有文件名字 106 | for f in os.listdir(cur): 107 | try: 108 | f_path = os.path.join(cur, f) 109 | content = open(f_path, "r").read() 110 | #print content 111 | soup = BeautifulSoup(content) 112 | if soup == None or soup.find("title") == None: 113 | continue 114 | title = soup.find("title").string 115 | 116 | flag = extract_news(soup, news_cnt) 117 | if flag == 0: 118 | news_cnt += 1 119 | print title 120 | if news_cnt % 1000 == 1: 121 | print news_cnt 122 | except Exception, e: 123 | print e 124 | continue 125 | 126 | #break 127 | print news_cnt 128 | print len(item_id_dict) -------------------------------------------------------------------------------- /others/后台2/spider/process_wdzjdata.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.6 16:26 first version 5 | 处理从网贷之家爬取来的数据,tab改‘,’,并附加dt字段 6 | ''' 7 | 8 | 9 | import sys 10 | reload(sys) 11 | sys.setdefaultencoding('utf-8') 12 | import time 13 | import datetime 14 | 15 | d1 = '2016-03-15' 16 | f_in_area = open("../data/platform_company/platform_areas.txt") 17 | f_out_area = open("../data/platform_company/platform_areas_our.csv", "w") 18 | area_data = [line[:-1] for line in f_in_area] 19 | columns = area_data[0] 20 | f_out_area.write(columns + ",dt\n") 21 | for line in area_data[1:]: 22 | if len(line) <= 2: 23 | d1 = datetime.datetime.strptime(d1, "%Y-%m-%d") 24 | d1 = d1 - datetime.timedelta(days=31) 25 | d1 = datetime.datetime.strftime(d1, "%Y-%m-%d") 26 | else: 27 | f_out_area.write(line + "," + 
d1[:7] + "-01" +"\n") 28 | 29 | 30 | 31 | d1 = '2016-03-15' 32 | f_in_area = open("../data/platform_company/platform_class.txt") 33 | f_out_area = open("../data/platform_company/platform_class_our.csv", "w") 34 | area_data = [line[:-1] for line in f_in_area] 35 | columns = area_data[0] 36 | f_out_area.write(columns + ",dt\n") 37 | for line in area_data[1:]: 38 | if len(line) <= 2: 39 | d1 = datetime.datetime.strptime(d1, "%Y-%m-%d") 40 | d1 = d1 - datetime.timedelta(days=31) 41 | d1 = datetime.datetime.strftime(d1, "%Y-%m-%d") 42 | else: 43 | f_out_area.write(line + "," + d1[:7] + "-01" +"\n") -------------------------------------------------------------------------------- /others/后台2/spider/sina_extractNews.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.13 19:50 first version 5 | 从sina网的html页面里提取结构化新闻数据 6 | ''' 7 | import csv 8 | import os 9 | import sys 10 | import bs4 11 | import datetime 12 | import requests, html2text 13 | try: 14 | from bs4 import BeautifulSoup 15 | except: 16 | import BeautifulSoup 17 | reload(sys) 18 | sys.setdefaultencoding('utf-8') 19 | sina_dir = r"F:\LoalaSave\finance.sina.com.cn" 20 | 21 | 22 | 23 | columns = "item_id,item_type,source,url,author,title,content,item_pub_time,tags,cmt_cnt,fav_cnt,gmt_create,exinfo1,exinfo2".split(',') 24 | item_id_dict = {} 25 | writer = csv.writer(file("../data/news/news_other/sina.csv", 'wb')) 26 | writer.writerow(columns) 27 | def extract_news(soup, news_cnt): 28 | try: 29 | metas = soup.find_all("meta") 30 | #print metas 31 | key_words = "" 32 | is_article = 0 33 | title = "" 34 | url = "" 35 | description = "" 36 | image_url = "" 37 | 38 | for meta in metas: 39 | if meta.has_attr("name") == True: 40 | if meta["name"] == "keywords": 41 | key_words = meta["content"] 42 | if meta["name"] == "weibo: article:create_at": 43 | # print meta["content"] 44 | item_pub_time = meta["content"].split(" ")[0] 45 | # print item_pub_time 46 | if meta.has_attr("property") == True: 47 | if meta["content"] == "news": 48 | is_article = 1 49 | if meta["property"] == "og:title": 50 | title = meta["content"] 51 | if meta["property"] == "og:url": 52 | url = meta["content"] 53 | if meta["property"] == "og:description": 54 | description = meta["content"] 55 | 56 | # print title 57 | # print key_words 58 | # print description 59 | # print url 60 | # print "" 61 | if is_article == 0: 62 | return -1 63 | item_id = "sina-" + str(news_cnt) 64 | content = "" 65 | #share BSHARE_POP blkContainerSblkCon clearfix blkContainerSblkCon_14 66 | content_div = soup.find(name="div", id="artibody") 67 | #print content_div 68 | p_list = content_div.find_all("p") 69 | #print p_list 70 | for i in xrange(len(p_list)): 71 | p = "" 72 | for e in p_list[i].contents: 73 | try: 74 | p += e.string 75 | except Exception: 76 | continue 77 | content += p + "\n" 78 | content = content.replace("\n", "###n###") 79 | content = content.replace("\r", "###r###") 80 | gmt_create = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %M:%S") 81 | cmt_cnt = 0 82 | fav_cnt = 0 83 | source = u"新浪财经" 84 | 85 | 86 | # print item_id 87 | item_type = "news" 88 | tags = key_words.replace(" ", ",") 89 | #print tags 90 | # print gmt_create 91 | # print content 92 | # print item_pub_time 93 | exinfo1 = "" 94 | exinfo2 = "" 95 | if image_url != "": 96 | exinfo2 = "image_url:" + image_url 97 | 98 | result = {} 99 | result['url'] = url 100 | result['item_id'] = item_id 101 | 
result['item_type'] = item_type 102 | result['author'] = 'sina_jizhe' 103 | result['source'] = source 104 | result['title'] = title 105 | result['content'] = content 106 | result['item_pub_time'] = item_pub_time 107 | result['tags'] = tags 108 | result['cmt_cnt'] = cmt_cnt 109 | result['fav_cnt'] = fav_cnt 110 | result['exinfo1'] = exinfo1 111 | result['exinfo2'] = exinfo2 112 | result['gmt_create'] = gmt_create 113 | 114 | line = [] 115 | for col in columns: 116 | if col not in result: 117 | line.append('') 118 | else: 119 | line.append(str(result[col]).encode('utf-8')) 120 | writer.writerow(line) 121 | except Exception, e: 122 | return -1 123 | return 0 124 | 125 | 126 | news_cnt = 0 127 | for cur,dirnames,filenames in os.walk(sina_dir): #三个参数:分别返回1.父目录 2.所有文件夹名字(不含路径) 3.所有文件名字 128 | for f in os.listdir(cur): 129 | print "#", f 130 | try: 131 | f_path = os.path.join(cur, f) 132 | soup = BeautifulSoup(open(f_path)) 133 | if soup == None or soup.find("title") == None: 134 | continue 135 | title = soup.find("title").string 136 | ##初步过滤 137 | if len(title.split("_")) < 3 \ 138 | or title.split("_")[2] != "新浪网": 139 | continue 140 | # print f_path 141 | # print title 142 | flag = extract_news(soup, news_cnt) 143 | if flag == 0: 144 | news_cnt += 1 145 | print news_cnt 146 | #print title 147 | if news_cnt % 1000 == 1: 148 | print news_cnt 149 | except Exception, e: 150 | print e 151 | continue 152 | 153 | #break 154 | print news_cnt 155 | print len(item_id_dict) -------------------------------------------------------------------------------- /others/后台2/spider/weixin_extractNews.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.13 19:50 first version 5 | 从wy163网的html页面里提取结构化新闻数据 6 | ''' 7 | import csv 8 | import os 9 | import sys 10 | import bs4 11 | import datetime 12 | import requests, html2text 13 | try: 14 | from bs4 import BeautifulSoup 15 | except: 16 | import BeautifulSoup 17 | reload(sys) 18 | sys.setdefaultencoding('utf-8') 19 | wy163_dir = r"..\data\news\weixin" 20 | 21 | 22 | 23 | columns = "item_id,item_type,source,url,author,title,content,item_pub_time,tags,cmt_cnt,fav_cnt,gmt_create,exinfo1,exinfo2".split(',') 24 | item_id_dict = {} 25 | writer = csv.writer(file("../data/news/news_other/weixin.csv", 'wb')) 26 | writer.writerow(columns) 27 | def extract_news_txt(soup, news_cnt): 28 | try: 29 | is_article = 1 30 | title = soup[0].strip() 31 | #print title 32 | item_pub_time = soup[2].strip() 33 | #print item_pub_time 34 | content = soup[3].strip() 35 | #print metas 36 | key_words = "" 37 | 38 | url = "" 39 | description = "" 40 | image_url = "" 41 | 42 | 43 | # print is_article 44 | # print item_pub_time 45 | # print title 46 | # print key_words 47 | # print description 48 | # print url 49 | # print "" 50 | if is_article == 0: 51 | return -1 52 | item_id = "weixin-" + str(news_cnt) 53 | content = content.replace("\n", "###n###") 54 | content = content.replace("\r", "###r###") 55 | content = content.replace("#n#", "###n###") 56 | content = content.replace("#r#", "###r###") 57 | gmt_create = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %M:%S") 58 | cmt_cnt = 0 59 | fav_cnt = 0 60 | source = u"微信" 61 | 62 | 63 | # print item_id 64 | item_type = "news" 65 | tags = key_words.replace(" ", ",") 66 | #print tags 67 | # print gmt_create 68 | # print content 69 | # print item_pub_time 70 | exinfo1 = "" 71 | exinfo2 = "" 72 | if image_url != "": 73 | exinfo2 = 
"image_url:" + image_url 74 | 75 | result = {} 76 | result['url'] = url 77 | result['item_id'] = item_id 78 | result['item_type'] = item_type 79 | result['author'] = soup[1].strip() 80 | result['source'] = source 81 | result['title'] = title 82 | result['content'] = content 83 | result['item_pub_time'] = item_pub_time 84 | result['tags'] = tags 85 | result['cmt_cnt'] = cmt_cnt 86 | result['fav_cnt'] = fav_cnt 87 | result['exinfo1'] = exinfo1 88 | result['exinfo2'] = exinfo2 89 | result['gmt_create'] = gmt_create 90 | 91 | line = [] 92 | for col in columns: 93 | if col not in result: 94 | line.append('') 95 | else: 96 | line.append(str(result[col]).encode('utf-8')) 97 | writer.writerow(line) 98 | except Exception, e: 99 | return -1 100 | return 0 101 | 102 | 103 | news_cnt = 0 104 | for cur,dirnames,filenames in os.walk(wy163_dir): #三个参数:分别返回1.父目录 2.所有文件夹名字(不含路径) 3.所有文件名字 105 | for f in os.listdir(cur): 106 | try: 107 | f_path = os.path.join(cur, f) 108 | soup = [line for line in open(f_path)] 109 | flag = extract_news_txt(soup, news_cnt) 110 | if flag == 0: 111 | news_cnt += 1 112 | print news_cnt 113 | if news_cnt % 1000 == 1: 114 | print news_cnt 115 | except Exception, e: 116 | print e 117 | continue 118 | 119 | #break 120 | print news_cnt 121 | -------------------------------------------------------------------------------- /others/后台2/spider/wy163_extractNews.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.13 19:50 first version 5 | 从wy163网的html页面里提取结构化新闻数据 6 | ''' 7 | import csv 8 | import os 9 | import sys 10 | import bs4 11 | import datetime 12 | import requests, html2text 13 | try: 14 | from bs4 import BeautifulSoup 15 | except: 16 | import BeautifulSoup 17 | reload(sys) 18 | sys.setdefaultencoding('utf-8') 19 | wy163_dir = r"F:\LoalaSave\money.163.com" 20 | 21 | 22 | 23 | columns = "item_id,item_type,source,url,author,title,content,item_pub_time,tags,cmt_cnt,fav_cnt,gmt_create,exinfo1,exinfo2".split(',') 24 | item_id_dict = {} 25 | writer = csv.writer(file("../data/news/news_other/wy163.csv", 'wb')) 26 | writer.writerow(columns) 27 | def extract_news(soup, news_cnt): 28 | try: 29 | main_div = soup.find(class_="post_content_main") 30 | if main_div != None: 31 | is_article = 1 32 | else: 33 | is_article = 0 34 | #print main_div 35 | title = main_div.find("h1").string 36 | #print title 37 | item_pub_time = main_div.find(class_="post_time_source").contents[0].strip().split(" ")[0] 38 | #print item_pub_time 39 | content_div = main_div.find(class_="post_text") 40 | #print content_div 41 | #print metas 42 | key_words = "" 43 | 44 | url = "" 45 | description = "" 46 | image_url = "" 47 | 48 | 49 | # print is_article 50 | # print item_pub_time 51 | # print title 52 | # print key_words 53 | # print description 54 | # print url 55 | # print "" 56 | if is_article == 0: 57 | return -1 58 | item_id = "wy163-" + str(news_cnt) 59 | content = "" 60 | #print content_div 61 | p_list = content_div.find_all("p") 62 | #print p_list 63 | for i in xrange(len(p_list)): 64 | p = "" 65 | for e in p_list[i].contents: 66 | try: 67 | p += e.string 68 | except Exception: 69 | continue 70 | content += p + "\n" 71 | #print content 72 | content = content.replace("\n", "###n###") 73 | content = content.replace("\r", "###r###") 74 | gmt_create = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %M:%S") 75 | cmt_cnt = 0 76 | fav_cnt = 0 77 | source = u"网易财经" 78 | 79 | 80 | # print item_id 81 | 
item_type = "news" 82 | tags = key_words.replace(" ", ",") 83 | #print tags 84 | # print gmt_create 85 | # print content 86 | # print item_pub_time 87 | exinfo1 = "" 88 | exinfo2 = "" 89 | if image_url != "": 90 | exinfo2 = "image_url:" + image_url 91 | 92 | result = {} 93 | result['url'] = url 94 | result['item_id'] = item_id 95 | result['item_type'] = item_type 96 | result['author'] = 'wy163_jizhe' 97 | result['source'] = source 98 | result['title'] = title 99 | result['content'] = content 100 | result['item_pub_time'] = item_pub_time 101 | result['tags'] = tags 102 | result['cmt_cnt'] = cmt_cnt 103 | result['fav_cnt'] = fav_cnt 104 | result['exinfo1'] = exinfo1 105 | result['exinfo2'] = exinfo2 106 | result['gmt_create'] = gmt_create 107 | 108 | line = [] 109 | for col in columns: 110 | if col not in result: 111 | line.append('') 112 | else: 113 | line.append(str(result[col]).encode('utf-8')) 114 | writer.writerow(line) 115 | except Exception, e: 116 | return -1 117 | return 0 118 | 119 | 120 | news_cnt = 0 121 | for cur,dirnames,filenames in os.walk(wy163_dir): #三个参数:分别返回1.父目录 2.所有文件夹名字(不含路径) 3.所有文件名字 122 | for f in os.listdir(cur): 123 | print "#", f 124 | try: 125 | f_path = os.path.join(cur, f) 126 | soup = BeautifulSoup(open(f_path)) 127 | if soup == None or soup.find("title") == None: 128 | continue 129 | title = soup.find("title").string 130 | flag = extract_news(soup, news_cnt) 131 | if flag == 0: 132 | news_cnt += 1 133 | print news_cnt 134 | if news_cnt % 1000 == 1: 135 | print news_cnt 136 | except Exception, e: 137 | print e 138 | continue 139 | 140 | #break 141 | print news_cnt 142 | print len(item_id_dict) -------------------------------------------------------------------------------- /others/后台2/spider/zhongshen_extractNews.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.13 19:50 first version 5 | 从caixin网的html页面里提取结构化新闻数据 6 | ''' 7 | import csv 8 | import os 9 | import sys 10 | import bs4 11 | import datetime 12 | import requests, html2text 13 | try: 14 | from bs4 import BeautifulSoup 15 | except: 16 | import BeautifulSoup 17 | reload(sys) 18 | sys.setdefaultencoding('utf-8') 19 | zhongshen_dir = r"C:\Users\Administrator\Desktop\Working Folder\Holmes\data\news\zhongshen" 20 | 21 | 22 | 23 | columns = "item_id,item_type,source,url,author,title,content,item_pub_time,tags,cmt_cnt,fav_cnt,gmt_create,exinfo1,exinfo2".split(',') 24 | item_id_dict = {} 25 | writer = csv.writer(file("../data/news/zhongshen.csv", 'wb')) 26 | writer.writerow(columns) 27 | def extract_news(soup, news_cnt): 28 | try: 29 | infomain = soup.find(class_="InfoMain") 30 | key_words = "" 31 | is_article = 0 32 | title = infomain.find("h1").string 33 | #print title 34 | url = "" 35 | description = "" 36 | image_url = "" 37 | info1 = infomain.find(class_="info1") 38 | item_pub_time = info1.find_all("span")[0].string.split(" ")[0] 39 | #print item_pub_time 40 | p_list = infomain.find(id="hiddenContent").find_all("p") 41 | content = "" 42 | for i in xrange(len(p_list)): 43 | if p_list[i].find("span") != None and p_list[i].find("span").string != None: 44 | content += p_list[i].find("span").string + "\n" 45 | content = content.replace("\n", "###n###") 46 | content = content.replace("\r", "###r###") 47 | item_id = "zhongshen-" + str(news_cnt) 48 | # print item_id 49 | if item_id not in item_id_dict: 50 | item_id_dict.setdefault(item_id, 0) 51 | else: 52 | return 53 | 54 | gmt_create = 
datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %M:%S") 55 | cmt_cnt = 0 56 | fav_cnt = 0 57 | source = u"zhongshen" 58 | 59 | 60 | # print item_id 61 | item_type = "news" 62 | tags = key_words.replace(" ", ",") 63 | exinfo1 = "" 64 | exinfo2 = "" 65 | if image_url != "": 66 | exinfo2 = "image_url:" + image_url 67 | 68 | result = {} 69 | result['url'] = url 70 | result['item_id'] = item_id 71 | result['item_type'] = item_type 72 | result['author'] = 'caixin_jizhe' 73 | result['source'] = source 74 | result['title'] = title 75 | result['content'] = content 76 | result['item_pub_time'] = item_pub_time 77 | result['tags'] = tags 78 | result['cmt_cnt'] = cmt_cnt 79 | result['fav_cnt'] = fav_cnt 80 | result['exinfo1'] = exinfo1 81 | result['exinfo2'] = exinfo2 82 | result['gmt_create'] = gmt_create 83 | 84 | line = [] 85 | for col in columns: 86 | if col not in result: 87 | line.append('') 88 | else: 89 | line.append(str(result[col]).encode('utf-8')) 90 | writer.writerow(line) 91 | except Exception, e: 92 | return -1 93 | return 0 94 | 95 | 96 | news_cnt = 0 97 | for cur,dirnames,filenames in os.walk(zhongshen_dir): #三个参数:分别返回1.父目录 2.所有文件夹名字(不含路径) 3.所有文件名字 98 | for f in os.listdir(cur): 99 | try: 100 | f_path = os.path.join(cur, f) 101 | content = open(f_path, "r").read() 102 | #print content 103 | if content.find("AjaxPage_Click_NEWSID") == -1: 104 | continue 105 | soup = BeautifulSoup(content) 106 | if soup == None or soup.find("title") == None: 107 | continue 108 | title = soup.find("title").string 109 | 110 | ##初步过滤 111 | if len(title.split("-")) < 2: 112 | continue 113 | # print f_path 114 | # print title 115 | flag = extract_news(soup, news_cnt) 116 | if flag == 0: 117 | news_cnt += 1 118 | print title 119 | if news_cnt % 1000 == 1: 120 | print news_cnt 121 | except Exception, e: 122 | print e 123 | continue 124 | 125 | #break 126 | print news_cnt 127 | print len(item_id_dict) -------------------------------------------------------------------------------- /others/后台2/summary_analyze.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.6 18:26 first version 5 | 舆情大盘数据汇总,并存入数据库holmesdb 6 | ''' 7 | 8 | import csv 9 | import json 10 | import time 11 | import re 12 | from string import punctuation,digits,letters,whitespace 13 | import sys 14 | import datetime 15 | 16 | import jieba 17 | import jieba.analyse 18 | import pandas as pd 19 | from gensim import corpora,models 20 | from helper.textprocessing import handleContent 21 | from pymongo import MongoClient 22 | client=MongoClient() 23 | reload(sys) 24 | sys.setdefaultencoding('utf-8') 25 | jieba.load_userdict("C:/Python27/Lib/site-packages/jieba-0.37-py2.7.egg/jieba/financedict.txt") 26 | 27 | 28 | 29 | db = client.holmesdb 30 | t_news = db.t_news_di 31 | t_policy = db.t_policy_di 32 | t_ugc = db.t_ugc_di 33 | t_expert = db.t_expert_opinion_di 34 | t_news_caixin = db.t_news_caixin_di 35 | 36 | news_res = t_news.find() 37 | policy_res = t_policy.find() 38 | ugc_res = t_ugc.find() 39 | expert_res = t_expert.find() 40 | article_res = [news_res, policy_res, ugc_res, expert_res] 41 | key = ["news", "policy", "ugc", "expert"] 42 | month_summary = {} 43 | month12_day_summary = {} 44 | source_summary = {} 45 | for i in xrange(4): 46 | for res in article_res[i]: 47 | if res['item_pub_time'] >= '2015-01-01' and res['item_pub_time'] <= '2015-12-31': 48 | title = res['title'] 49 | content = res['content'] 50 | t = res['item_pub_time'] 51 
| m = t[5:7] 52 | date = t[5:].replace("-", ".").split(" ")[0] 53 | #print m, date 54 | month_summary[m][key[i]] = month_summary.setdefault(m , {}).setdefault(key[i], 0) + 1 55 | if date >= '12.01' and date <= '12.31': 56 | month12_day_summary[date][key[i]] = month12_day_summary.setdefault(date , {}).setdefault(key[i], 0) + 1 57 | source = res['source'] 58 | if source == "和讯P2P政策": 59 | source = "和讯P2P" 60 | if source == 'zhongshen': 61 | source = '中申网' 62 | source_summary[source] = source_summary.setdefault(source, 0) + 1 63 | 64 | print month_summary 65 | print month12_day_summary 66 | for source in source_summary: 67 | print source, source_summary[source] 68 | 69 | def writeJsonDict(person, f_out): 70 | outStr = json.dumps(person, ensure_ascii = False) #处理完之后重新转为Json格式 71 | f_out.write(outStr.encode('utf-8') + '\n') #写回到一个新的Json文件中去 72 | 73 | print "begin save datas", datetime.datetime.now() 74 | writeJsonDict(month_summary, open("./data/summary/month_summary.json", "w")) 75 | writeJsonDict(month12_day_summary, open("./data/summary/month12_day_summary.json", "w")) 76 | writeJsonDict(source_summary, open("./data/summary/source_summary.json", "w")) 77 | 78 | pos_weight = [0.37, 0.43, 0.42, 0.45, 0.43, 0.44, 0.42, 0.4, 0.39, 0.387,\ 79 | 0.38, 0.378, 0.376, 0.365, 0.33, 0.274, 0.25, 0.26, 0.35, 0.42,\ 80 | 0.41, 0.47, 0.43, 0.46, 0.47, 0.43, 0.44, 0.45, 0.463, 0.456, \ 81 | 0.465 82 | ] 83 | sa_month12_day_summary = {} 84 | for dt in month12_day_summary: 85 | cnt = month12_day_summary[dt]["ugc"] 86 | cnt_pos = int(cnt * pos_weight[int(dt[3:])-1]) 87 | cnt_nag = int(cnt - cnt_pos) 88 | sa_month12_day_summary.setdefault(dt, {"pos":cnt_pos, "nag":cnt_nag}) 89 | writeJsonDict(sa_month12_day_summary, open("./data/summary/sa_month12_day_summary.json", "w")) 90 | 91 | print "end save datas", datetime.datetime.now() -------------------------------------------------------------------------------- /others/后台2/temp.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.31测试 5 | ''' 6 | import sys 7 | import os 8 | import json 9 | reload(sys) 10 | sys.setdefaultencoding('utf-8') 11 | 12 | 13 | def getJsonFile_all(json_fname): 14 | json_file = open(json_fname, "r") 15 | dict = json.load(json_file) 16 | json_file.close() 17 | return dict 18 | def writeJsonDict(person, f_out): 19 | outStr = json.dumps(person, ensure_ascii = False) #处理完之后重新转为Json格式 20 | f_out.write(outStr.encode('utf-8') + '\n') #写回到一个新的Json文件中去 21 | 22 | 23 | # json_list = [] 24 | # for cur, dir, fname_list in os.walk("./data/_temp"): 25 | # for f in fname_list: 26 | # print f 27 | # if f[0] == 'a': 28 | # continue 29 | # f_path = os.path.join(cur, f) 30 | # json_data = getJsonFile_all(f_path) 31 | # date_dict = {} 32 | # for data in json_data: 33 | # d = data["item_pub_time"].split(" ")[0].replace("-", ".") 34 | # date_dict[d] = date_dict.setdefault(d, 0) + 1 35 | # print sorted(date_dict.items(), lambda a,b: cmp(a[0], b[0])) 36 | # new_path = os.path.join(cur, "ana_" + f) 37 | # writeJsonDict(date_dict, open(new_path, "w")) 38 | 39 | 40 | #encoding=utf8 41 | ''' 42 | __author__ = 'Administrator' 43 | 2016.4.17 01:08 first version 44 | 构建知识图谱,pipeline 45 | 1、词性标注 46 | 2、歧义消除 47 | 3、关系抽取 48 | 4、知识推理 49 | 5、知识表示 50 | ''' 51 | 52 | import csv 53 | import json 54 | import time 55 | import re 56 | from string import punctuation,digits,letters,whitespace 57 | import sys 58 | import datetime 59 | from helper import myio 60 | import jieba 61 | 
import jieba.analyse 62 | import jieba.posseg as pseg 63 | import math 64 | import pandas as pd 65 | from gensim import corpora,models 66 | from helper.textprocessing import handleContent 67 | from pymongo import MongoClient 68 | client=MongoClient() 69 | reload(sys) 70 | sys.setdefaultencoding('utf-8') 71 | 72 | knowledge_graph_dir = "./data/knowledge_graph/" 73 | def getLastNameDict(): 74 | last_name_dict ={} 75 | name_vec = [line.strip().split(" ") for line in open(knowledge_graph_dir + u"中国姓.txt")] 76 | for vec in name_vec: 77 | if len(vec) > 1: 78 | for v in vec: 79 | last_name_dict.setdefault(v, 0) 80 | return last_name_dict 81 | 82 | def extractEntity(): 83 | db = client.holmesdb 84 | t_news = db.t_news_di 85 | res_list = t_news.find() 86 | last_name_dict = getLastNameDict() 87 | 88 | ntoken_dict = {} 89 | people_dict = {} 90 | row_cnt = 0 91 | for res in res_list: 92 | row_cnt += 1 93 | title = res["title"] 94 | content = res["content"] 95 | doc = myio.handleContent(title) + " " + myio.handleContent(content) 96 | words = pseg.cut(doc) 97 | for (word, flag) in words: 98 | if flag.find("n") != -1: 99 | print word, flag 100 | word1 = word[0].encode("utf-8") 101 | word2 = word[:2].encode("utf-8") 102 | if word1 in last_name_dict or word2 in last_name_dict: 103 | #print word[0], word[:2] 104 | people_dict[word] = people_dict.setdefault(word, 0) + 1 105 | else: 106 | #print w.word, w.flag 107 | ntoken_dict[word] = ntoken_dict.setdefault(word, 0) + 1 108 | ntoken_list = sorted(ntoken_dict.items(), lambda a, b: -cmp(a[1], b[1])) 109 | people_list = sorted(people_dict.items(), lambda a, b: -cmp(a[1], b[1])) 110 | 111 | 112 | if __name__ == "__main__": 113 | #pipeline step1 114 | extractEntity() -------------------------------------------------------------------------------- /others/后台2/topic_model.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.2.27 20:30 first version 5 | 训练主题模型 6 | ''' 7 | import csv 8 | import json 9 | import time 10 | import re 11 | from string import punctuation,digits,letters,whitespace 12 | import sys 13 | import datetime 14 | import jieba 15 | import jieba.analyse 16 | import pandas as pd 17 | from gensim import corpora,models 18 | from helper.textprocessing import handleContent 19 | 20 | reload(sys) 21 | sys.setdefaultencoding('utf-8') 22 | jieba.load_userdict("C:/Python27/Lib/site-packages/jieba-0.37-py2.7.egg/jieba/financedict.txt") 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | print "before load date", datetime.datetime.now() 32 | 33 | news_dataset = pd.read_pickle("./data/news_dataset.pkl") 34 | # news_dataset_other = pd.read_pickle("./data/news_dataset_other.pkl") 35 | # all_dataset = pd.concat([news_dataset, news_dataset_other]) 36 | all_dataset = news_dataset 37 | print "end load date", datetime.datetime.now() 38 | 39 | 40 | text_tags = [] 41 | lda_train_set = [] 42 | 43 | 44 | 45 | ## 8000 articles 5mins 46 | print "before cut segments", datetime.datetime.now() 47 | # 分词,关键字提取 48 | 49 | for content in all_dataset['content']: 50 | content = handleContent(content) 51 | seg = list(jieba.cut(content)) 52 | lda_train_set.append(seg) 53 | 54 | print "end cut segments", datetime.datetime.now() 55 | 56 | 57 | print "before LDA", datetime.datetime.now() 58 | # LDA主题模型 59 | dic = corpora.Dictionary(lda_train_set) 60 | corpus = [dic.doc2bow(text) for text in lda_train_set] 61 | tfidf = models.TfidfModel(corpus) 62 | tfidf.save("./data/tfidf_dict.model") 63 | 
corpus_tfidf = tfidf[corpus] 64 | 65 | # 8000 article 2mins 66 | lda = models.LdaModel(corpus_tfidf, id2word = dic, num_topics = 200) 67 | lda.save("./data/lda.model") 68 | corpus_lda = lda[corpus_tfidf] 69 | 70 | for i in range(0, lda.num_topics): 71 | print i, lda.print_topic(i) 72 | 73 | for p in corpus_lda: 74 | print p 75 | 76 | print "end LDA", datetime.datetime.now() 77 | 78 | 79 | topic_doc_dict = {} 80 | for i in xrange(0 , len(corpus_lda)): 81 | cnt = 0 82 | for pp in sorted(corpus_lda[i], lambda a,b: -cmp(a[1], b[1])): 83 | cnt += 1 84 | if cnt >= 2: break 85 | topic_id, weight = pp[0], pp[1], 86 | topic_doc_dict[topic_id][i] = topic_doc_dict.setdefault(topic_id, {}).setdefault(i, 0) + weight 87 | 88 | for topic_id in topic_doc_dict: 89 | tag_set = {} 90 | for doc_id in topic_doc_dict[topic_id]: 91 | for tag in text_tags[doc_id]: 92 | tag_set[tag] = tag_set.setdefault(tag, 0) + topic_doc_dict[topic_id][doc_id] 93 | print topic_id, len(tag_set), 94 | for tag in sorted(tag_set.items(), lambda a,b: -cmp(a[1], b[1])): 95 | print("%s %s" %(tag[0], tag[1])), 96 | print "" 97 | 98 | -------------------------------------------------------------------------------- /others/后台2/vectorize.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | ''' 3 | __author__ = 'Administrator' 4 | 2016.3.3 22:30 first version 5 | 将单词、文章、用户向量化,包括word2vec, doc2vec 6 | ''' 7 | import logging 8 | import re 9 | import sys 10 | import datetime 11 | import gensim 12 | from gensim.models.doc2vec import TaggedDocument 13 | import jieba 14 | import pandas as pd 15 | from gensim.models import Word2Vec, Doc2Vec 16 | from helper.textprocessing import handleContent, cut_sentence_2 17 | reload(sys) 18 | sys.setdefaultencoding('utf-8') 19 | 20 | stop_dict = {} 21 | for line in open("C:\Python27\Lib\site-packages\jieba-0.37-py2.7.egg\jieba\stop_chinese.txt"): 22 | stop_dict.setdefault(line.strip(), 0) 23 | #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 24 | 25 | ##Load documents 26 | print "before load date", datetime.datetime.now() 27 | 28 | news_dataset = pd.read_pickle("./data/news_dataset.pkl") 29 | news_dataset_other = pd.read_pickle("./data/news_dataset_other.pkl") 30 | all_dataset = pd.concat([news_dataset, news_dataset_other]) 31 | 32 | print "end load date", datetime.datetime.now() 33 | 34 | 35 | print "before word2vec", datetime.datetime.now() 36 | documents = [] 37 | sentences = [] 38 | for i in xrange(0, len(all_dataset)): 39 | title = all_dataset.iloc[i]['title'] 40 | content = all_dataset.iloc[i]['content'] 41 | doc = handleContent(title) + " " + handleContent(content) 42 | tokens = list(jieba.cut(doc)) 43 | new_tokens = [] 44 | 45 | for i in xrange( len(tokens) ): 46 | if tokens[i].isdigit() == True or len(tokens[i]) <= 1\ 47 | or (tokens[i].isalnum() == True and len(tokens[i]) > 20): 48 | #print tokens[i], 49 | continue 50 | if tokens[i] in stop_dict:#去停用词 51 | continue 52 | #u'数正', '下险企 53 | #if tokens[i] in[u'融系',u'办则',u'部是', u'若仅', u'虽同', u'或苏', u'由十']: 54 | #print tokens[i], title, content 55 | 56 | new_tokens.append(tokens[i]) 57 | # for token in new_tokens: 58 | # print token, 59 | # print "" 60 | # print len(tokens),len(new_tokens) 61 | documents.append(new_tokens) 62 | # content = content.replace("#r#", "\r").replace("#n#", "\n").replace("#t#", "\t") 63 | # sentence_list = [title] + cut_sentence_2(content) 64 | # for i in xrange(len(sentence_list)): 65 | # sentence_list[i] = 
handleContent(sentence_list[i]) 66 | sentences.append(doc) 67 | 68 | 69 | ## train a word2vec model 70 | num_features = 200 # Word vector dimensionality 71 | min_word_count = 1 # Minimum word count 72 | num_workers = 4 # Number of threads to run in parallel 73 | context = 10 # Context window size 74 | downsampling = 1e-5 # Downsample setting for frequent words 75 | 76 | print "Training Word2Vec model...", datetime.datetime.now() 77 | model = Word2Vec(documents, \ 78 | workers=num_workers,\ 79 | size=num_features,\ 80 | min_count=min_word_count,\ 81 | window=context, \ 82 | sample=downsampling,\ 83 | seed=1) 84 | 85 | model.init_sims(replace=True) 86 | model.save('./data/word2vec.model') 87 | print "here" 88 | for pp in model.most_similar(["陆金所".decode("utf8")],topn=30): 89 | print pp[0], pp[1], "\t", 90 | print "" 91 | for pp in model.most_similar(["P2P".decode("utf8")],topn=30): 92 | print pp[0], pp[1], "\t", 93 | print "" 94 | for pp in model.most_similar(["网贷".decode("utf8")],topn=30): 95 | print pp[0], pp[1], "\t", 96 | print "" 97 | for pp in model.most_similar(["e租宝".decode("utf8")],topn=30): 98 | print pp[0], pp[1], "\t", 99 | print "" 100 | print "end word2vec", datetime.datetime.now() 101 | 102 | 103 | print "before doc2vec", datetime.datetime.now() 104 | class DocIterator(object): 105 | def __init__(self, documents): 106 | self.documents = documents 107 | 108 | def __iter__(self): 109 | for i in xrange(len(self.documents)): 110 | words = self.documents[i] 111 | tags = [i] 112 | yield TaggedDocument(words, tags) 113 | 114 | ## train a doc2vec model 115 | print "Training DocVec model..." 116 | model = Doc2Vec(DocIterator(documents), \ 117 | size=100, \ 118 | window=8, \ 119 | min_count=5,\ 120 | workers=4) 121 | model.init_sims(replace=True) 122 | model.save('./data/doc2vec.model') 123 | print "end doc2vec", datetime.datetime.now() 124 | -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/爬虫/wd/bbs_rong360/bbs_rong360/.DS_Store -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/爬虫/wd/bbs_rong360/bbs_rong360/__init__.py -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/爬虫/wd/bbs_rong360/bbs_rong360/__init__.pyc -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BbsRong360Item(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | 
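# A minimal sketch of how the fields could be declared here, based on the CSV header
# that content.py / detail.py write by hand and on the schema described in 爬虫文档.txt;
# the class name ForumPostItem is illustrative only and is not used elsewhere in the project.
class ForumPostItem(scrapy.Item):
    item_id = scrapy.Field()        # thread id taken from the URL
    item_type = scrapy.Field()      # "opinion" for the opening post, "reply" for replies
    source = scrapy.Field()         # site name, e.g. 融360
    url = scrapy.Field()            # thread URL
    author = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()        # post body, with \r \n \t escaped as #r# #n# #t#
    item_pub_time = scrapy.Field()  # publish time, yyyy-mm-dd
    tags = scrapy.Field()           # comma-separated tags
    cmt_cnt = scrapy.Field()        # comment count
    fav_cnt = scrapy.Field()        # like / favourite count
    gmt_create = scrapy.Field()     # time this record was created
    exinfo1 = scrapy.Field()        # reserved; holds the replied-to id for replies
    exinfo2 = scrapy.Field()        # reserved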
-------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/middlewares.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/爬虫/wd/bbs_rong360/bbs_rong360/middlewares.pyc -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BbsRong360Pipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for bbs_rong360 project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'bbs_rong360' 13 | 14 | SPIDER_MODULES = ['bbs_rong360.spiders'] 15 | NEWSPIDER_MODULE = 'bbs_rong360.spiders' 16 | 17 | DOWNLOAD_HANDLERS = {'s3': None,} 18 | 19 | # COOKIES_ENABLED = False 20 | 21 | # DOWNLOAD_DELAY = 3 22 | 23 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 24 | #USER_AGENT = 'bbs_rong360 (+http://www.yourdomain.com)' 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS=32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | # DOWNLOAD_DELAY=3 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 35 | #CONCURRENT_REQUESTS_PER_IP=16 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED=False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED=False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'bbs_rong360.middlewares.MyCustomSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'bbs_rong360.middlewares.MyCustomDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See 
http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | #ITEM_PIPELINES = { 70 | # 'bbs_rong360.pipelines.SomePipeline': 300, 71 | #} 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 76 | # AUTOTHROTTLE_ENABLED=True 77 | # The initial download delay 78 | # AUTOTHROTTLE_START_DELAY=5 79 | # The maximum download delay to be set in case of high latencies 80 | #AUTOTHROTTLE_MAX_DELAY=60 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG=False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED=True 87 | #HTTPCACHE_EXPIRATION_SECS=0 88 | #HTTPCACHE_DIR='httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 90 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/爬虫/wd/bbs_rong360/bbs_rong360/settings.pyc -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
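#
# Sketch of the crawl flow, inferred from bbs.py, detail.py and content.py in this
# package: the three spiders form a small pipeline and are meant to be run in order:
#   1. "bbs"     walks the list pages http://bbs.rong360.com/forum-55-<page>.html
#                (pages 1-171) and appends every thread URL to urls.txt
#   2. "detail"  reads urls.txt and stores each thread's raw body as html/<id>.html
#   3. "content" parses the files under html/ and writes one CSV row per post
#                (opening post as "opinion", replies as "reply") to 融360.csv
# A typical invocation from the project directory would be:
#   scrapy crawl bbs && scrapy crawl detail && scrapy crawl content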
5 | -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/__init__.pyc -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/bbs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | class BbsSpider(scrapy.Spider): 6 | name = "bbs" 7 | start_urls = [] 8 | 9 | def __init__(self): 10 | for page in range(1, 171+1): 11 | self.start_urls.append("http://bbs.rong360.com/forum-55-%d.html"%page) 12 | 13 | def parse(self, response): 14 | urls = response.xpath('//tbody[contains(@id, "normalthread")]/tr/td[@class="icn"]/a/@href').extract() 15 | for url in urls: 16 | df = open("urls.txt", "a") 17 | df.write(url+"\n") 18 | df.close() 19 | 20 | -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/bbs.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/bbs.pyc -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/content.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding("utf-8") 6 | 7 | import os 8 | import json 9 | import scrapy 10 | 11 | class ContentSpider(scrapy.Spider): 12 | name = "content" 13 | start_urls = [] 14 | 15 | def __init__(self): 16 | df = open("融360.csv", "w") 17 | df.write("item_id,item_type,source,url,author,title,content,item_pub_time,tags,cmt_cnt,fav_cnt,gmt_create,exinfo1,exinfo2\n") 18 | df.close() 19 | for f in os.listdir("html"): 20 | self.start_urls.append("file:///Users/ziaoang/Documents/p2p/bbs_rong360/html/" + f) 21 | 22 | def parse(self, response): 23 | id = response.url.split("/")[-1].replace(".html","") 24 | title = response.xpath('//span[@id="thread_subject"]/text()').extract()[0] 25 | 26 | df = open("融360.csv", "a") 27 | tids = response.xpath('//div[re:test(@id, "post_\d+$")]/@id').extract() 28 | for i in range(len(tids)): 29 | try: 30 | tid = tids[i].replace("post_","") 31 | time = response.xpath('//em[@id="authorposton%s"]/text()'%tid).extract()[0].replace("发表于 ","") 32 | content = "".join(response.xpath('//td[@id="postmessage_%s"]//text()'%tid).extract()) 33 | content = '"' + content.strip().replace("\r","#r#").replace("\n","#n#").replace("\t","#t#") + '"' 34 | url = "http://bbs.rong360.com/thread-%s-1.html"%id 35 | if i == 0: 36 | df.write("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n"%(id,"opinion","融360",url,"",title,content,time,"","","","","","")) 37 | else: 38 | df.write("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n"%(id,"reply","融360",url,"","",content,time,"","","","","","")) 39 | except: 40 | pass 41 | df.close() 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/content.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/content.pyc -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/detail.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding("utf-8") 6 | 7 | import os 8 | import json 9 | import scrapy 10 | 11 | class DetailSpider(scrapy.Spider): 12 | name = "detail" 13 | start_urls = [] 14 | 15 | def __init__(self): 16 | df = open("融360.csv", "w") 17 | df.write("item_id,item_type,source,url,author,title,content,item_pub_time,tags,cmt_cnt,fav_cnt,gmt_create,exinfo1,exinfo2\n") 18 | df.close() 19 | for line in open("urls.txt"): 20 | self.start_urls.append(line.strip()) 21 | # break 22 | 23 | def parse(self, response): 24 | id = response.url.split("/")[-1].replace("thread-","").replace(".html","") 25 | df = open("html/%s.html"%id, "w") 26 | df.write(response.body) 27 | df.close() 28 | 29 | ''' 30 | id = response.url.split("/")[-1].replace("thread-","").replace(".html","") 31 | title = response.xpath('//span[@id="thread_subject"]/text()').extract()[0] 32 | 33 | df = open("融360.csv", "a") 34 | tids = response.xpath('//div[re:test(@id, "post_\d+$")]/@id').extract() 35 | for i in range(len(tids)): 36 | tid = tids[i].replace("post_","") 37 | time = response.xpath('//em[@id="authorposton%s"]/text()'%tid).extract()[0].replace("发表于 ","") 38 | content = "".join(response.xpath('//td[@id="postmessage_%s"]//text()'%tid).extract()) 39 | content = '"' + content.strip().replace("\r","#r#").replace("\n","#n#").replace("\t","#t#") + '"' 40 | if i == 0: 41 | df.write("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n"%(id,"opinion","融360",response.url,"",title,content,time,"","","","","","")) 42 | else: 43 | df.write("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n"%(id,"reply","融360",response.url,"","",content,time,"","","","","","")) 44 | df.close() 45 | ''' 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/detail.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/爬虫/wd/bbs_rong360/bbs_rong360/spiders/detail.pyc -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/proxy_inuse.txt: -------------------------------------------------------------------------------- 1 | 119.188.94.145:80 2 | 125.123.81.153:3128 3 | 182.89.6.100:8123 4 | 222.82.161.217:8090 5 | 106.1.59.149:8123 6 | 111.176.154.126:3128 7 | 182.246.38.56:8090 8 | 110.72.35.111:8123 9 | 110.72.39.46:8123 10 | 118.193.48.114:4444 11 | 110.73.9.191:8123 12 | 106.2.111.207:80 13 | 27.9.156.128:8090 14 | 180.213.179.43:8090 15 | 121.31.145.239:8123 16 | 171.39.1.124:8123 17 | 171.37.164.247:8123 18 | 182.90.50.55:8123 19 | 171.39.96.123:8123 20 | 171.37.133.164:8123 21 | -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/randomproxy.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2013 by Aivars Kalvans 2 | # 3 | # Permission is hereby granted, free of charge, to 
any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 20 | 21 | import re 22 | import random 23 | import base64 24 | 25 | 26 | class RandomProxy(object): 27 | def __init__(self, settings): 28 | self.proxy_list = settings.get('PROXY_LIST') 29 | fin = open(self.proxy_list) 30 | 31 | self.proxies = {} 32 | for line in fin.readlines(): 33 | self.proxies[line.strip()] = '' 34 | 35 | fin.close() 36 | 37 | @classmethod 38 | def from_crawler(cls, crawler): 39 | return cls(crawler.settings) 40 | 41 | def process_request(self, request, spider): 42 | # Don't overwrite with a random one (server-side state for IP) 43 | if 'proxy' in request.meta: 44 | return 45 | 46 | proxy_address = random.choice(self.proxies.keys()) 47 | proxy_user_pass = self.proxies[proxy_address] 48 | 49 | request.meta['proxy'] = proxy_address 50 | if proxy_user_pass: 51 | basic_auth = 'Basic ' + base64.encodestring(proxy_user_pass) 52 | request.headers['Proxy-Authorization'] = basic_auth 53 | 54 | def process_exception(self, request, exception, spider): 55 | proxy = request.meta['proxy'] 56 | try: 57 | del self.proxies[proxy] 58 | except ValueError: 59 | pass -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/randomproxy.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/others/爬虫/wd/bbs_rong360/randomproxy.pyc -------------------------------------------------------------------------------- /others/爬虫/wd/bbs_rong360/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = bbs_rong360.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = bbs_rong360 12 | -------------------------------------------------------------------------------- /others/爬虫/wd/爬虫文档.txt: -------------------------------------------------------------------------------- 1 | 爬虫文档 2 | 3 | 基本爬取方法 4 | ======== 5 | 1. 订制列表界面爬虫 6 | 该界面包含信息实体的概述信息,包括标题,作者,发表时间等,订制爬虫专门爬取对应详细界面的url 7 | 2. 订制详细信息界面爬虫 8 | 该界面包含信息实体的详细信息,包括标题,作者,发表时间,内容正文,评论信息,订制详细信息界面爬虫爬取对应的详细信息 9 | 10 | 进阶爬虫方法 11 | ======== 12 | 1. 抓取网站app后台获取数据API 13 | 下载要爬取网站的app版本,试用并对网络流进行抓包,分析出网站后台获取数据API 14 | 2. 
模拟网站app进行API请求获取数据 15 | 根据分析出来的API和相应的数据请求格式模拟网站app进行数据请求抓包,获取数据 16 | 17 | 代码框架 18 | ======== 19 | scrapy + xpath + beautifulsoup 20 | 21 | 爬取主要字段 22 | ======== 23 | 新闻类每条新闻一行记录,保存格式: 24 | item_id string or int, 资讯id 25 | Item_type string, “news”、”industry”、”policy”、”opinion”、”reply” 26 | source string, 网站名称,包括金融之家、和讯网、P2P观察网 27 | url string, 新闻链接 28 | author string, 29 | title string, 30 | content string, 31 | item_pub_time datetime, 发布时间 yyyy-mm-dd 32 | tags sring, 新闻在页面的标签,用’,’分隔 33 | cmt_cnt int, 评论数 34 | fav_cnt int, 点赞或者收藏数 35 | gmt_create datetime, 该记录创建时间,yyyy-mm-dd mm:ss 36 | exinfo1 string, 保留字段,如果是reply的话,保留被回复的id 37 | exinfo2 string 38 | 39 | 平台&公司类每个平台一行记录,保存格式: 40 | platform_id string or int, 平台id 41 | platform_name string, 平台名字 42 | platform_type string, 平台类型”信用贷、企业贷车贷 、房贷 、债权流转、优选理财、票据抵押、其他” 43 | platform_status string, 平台状态”runing、close、issue” 44 | company string, 所属公司 45 | need_invest double, 最少需要投资的金额 46 | prospect_earn string, 预期收益,格式”xx%~xx%”,四舍五入 47 | Risk_weight int, 风险系数1,2,3,4,5 48 | source string, 来源 49 | Source_url string, 来源链接 50 | gmt_create datetime, 该记录创建时间,yyyy-mm-dd mm:ss 51 | exinfo1 string, 保留字段 52 | exinfo2 string 53 | 54 | 爬取网站列表 55 | ======== 56 | 新闻类 57 | http://news.jrzj.com/p2p 58 | 金融之家 59 | http://www.wdzj.com/news/hangye 60 | 网贷之家-行业(3300篇) 61 | http://www.wdzj.com/news/pingtai 62 | 网贷之家-平台(1100篇) 63 | http://p2p.hexun.com 64 | 和讯网 65 | http://www.p2pguancha.com 66 | P2P观察网 67 | http://www.caixin.com/ 68 | 财新网(作为预料参与NLP模型的训练) 69 | 70 | 71 | 国家政策 72 | http://p2p.hexun.com/zc 73 | 和讯网政策版块 74 | http://www.wdzj.com/news/zhengce 75 | 网贷之家政策版块(340篇) 76 | 77 | P2P平台&公司 78 | http://shuju.wdzj.com/platdata-1.html 79 | 网贷之家平台数据,按平台数据格式保存,不按文档里的格式 80 | http://www.rong360.com/licai-p2p/pingtai/rating 81 | 融360,P2P平台评级 82 | http://caifu.baidu.com/wealth 83 | 百度财富 有300左右个P2P公司信息 84 | http://licai.p2peye.com/lcdt 85 | p2p理财 86 | http://www.p2peye.com/platform/search/h0i0c0x0r0t0s0b0p1.html 87 | 网贷天眼网贷平台汇总 (4189个) 88 | 89 | 用户评论、观点类 90 | http://licai.p2peye.com/investshare 91 | 用户分享 92 | http://bbs.wdzj.com 93 | 网贷人论坛 94 | http://zhihu.com 95 | 知乎,相对专业的评论,应该较长 96 | http://bbs.rong360.com/forum-55-1.html 97 | 融360 P2P论坛,整个版块抓取 98 | 99 | 爬虫代码 100 | ======== 101 | 附件 102 | 103 | -------------------------------------------------------------------------------- /static/css/dashboard.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Base structure 3 | */ 4 | 5 | /* Move down content because we have a fixed navbar that is 50px tall */ 6 | body { 7 | padding-top: 50px; 8 | } 9 | 10 | 11 | /* 12 | * Global add-ons 13 | */ 14 | 15 | .sub-header { 16 | padding-bottom: 10px; 17 | border-bottom: 1px solid #eee; 18 | } 19 | 20 | /* 21 | * Top navigation 22 | * Hide default border to remove 1px line. 23 | */ 24 | .navbar-fixed-top { 25 | border: 0; 26 | } 27 | 28 | /* 29 | * Sidebar 30 | */ 31 | 32 | /* Hide for mobile, show later */ 33 | .sidebar { 34 | display: none; 35 | } 36 | @media (min-width: 768px) { 37 | .sidebar { 38 | position: fixed; 39 | top: 51px; 40 | bottom: 0; 41 | left: 0; 42 | z-index: 1000; 43 | display: block; 44 | padding: 20px; 45 | overflow-x: hidden; 46 | overflow-y: auto; /* Scrollable contents if viewport is shorter than content. 
*/ 47 | background-color: #f5f5f5; 48 | border-right: 1px solid #eee; 49 | } 50 | } 51 | 52 | /* Sidebar navigation */ 53 | .nav-sidebar { 54 | margin-right: -21px; /* 20px padding + 1px border */ 55 | margin-bottom: 20px; 56 | margin-left: -20px; 57 | } 58 | .nav-sidebar > li > a { 59 | padding-right: 20px; 60 | padding-left: 20px; 61 | } 62 | .nav-sidebar > .active > a, 63 | .nav-sidebar > .active > a:hover, 64 | .nav-sidebar > .active > a:focus { 65 | color: #fff; 66 | background-color: #428bca; 67 | } 68 | 69 | 70 | /* 71 | * Main content 72 | */ 73 | 74 | .main { 75 | padding: 20px; 76 | } 77 | @media (min-width: 768px) { 78 | .main { 79 | padding-right: 40px; 80 | padding-left: 40px; 81 | } 82 | } 83 | .main .page-header { 84 | margin-top: 0; 85 | } 86 | 87 | 88 | /* 89 | * Placeholder dashboard ideas 90 | */ 91 | 92 | .placeholders { 93 | margin-bottom: 30px; 94 | text-align: center; 95 | } 96 | .placeholders h4 { 97 | margin-bottom: 0; 98 | } 99 | .placeholder { 100 | margin-bottom: 20px; 101 | } 102 | .placeholder img { 103 | display: inline-block; 104 | border-radius: 50%; 105 | } 106 | -------------------------------------------------------------------------------- /static/css/sign_in.css: -------------------------------------------------------------------------------- 1 | body{ 2 | background-image: url("../img/dl.jpg"); 3 | } 4 | 5 | .sign_in_input { 6 | width: 100%; 7 | padding: 10px; 8 | font-size: 16px; 9 | height: auto; 10 | } -------------------------------------------------------------------------------- /static/data/hot_keyword.json: -------------------------------------------------------------------------------- 1 | {"dt": "2015-12-29", "month_hot_keywords": "e租宝:1.82543890332;宜人贷:0.874153359485;上市:0.746187357849;征求意见:0.730134647048;早报:0.681997022146;速报:0.681997022146;办法:0.65499315201;大大:0.610310583229;纽交所:0.609133857773;调查:0.592044634463;细则:0.591111017562;观察:0.581325950509;责任编辑:0.573899253426;来源:0.571275453897;经营:0.565265533198;中介机构:0.558812590894;美元:0.54405237903;登陆:0.543869515869;事件:0.530250975761;明年:0.52910574318;出借:0.525476762064;负面:0.525476762064;部门:0.520103820183;涉嫌:0.517892025776;报道:0.514530365141;三农:0.507585376415;存管:0.495620382607;合规:0.493234167371;活动:0.488883075296;清单:0.488827135502;信息:0.469753716953;第一股:0.465056291569;自己:0.465056291569;三板:0.463431791396;集团:0.459247500902;非法:0.458295818058;相关:0.450114072287;管理:0.448932761089;行为:0.448142819022;披露:0.447954406301;消息:0.447919840475;用户:0.446841980888;官网:0.446441747711;资本:0.445457354661;公开:0.438605088285;自融:0.436377762919;众筹:0.4348652715;中国:0.434501210367;旗下:0.433217055772;暂行办法:0.422769414488;开展:0.415197868207;之家:0.414457731708;爆料:0.41315088778;公告:0.412741424861;发布:0.411078425733;员工:0.409387060047;IPO:0.408123093166;我们:0.404047002981;或者:0.403879713313;暂停:0.403555227849;要求:0.402890841729;有限公司:0.40272320975;规范:0.401665895414;总部:0.4013465767;日报:0.401091747582;累计:0.400396419349;社会:0.397409011407;集资:0.396826830097;网络:0.396387069107;有关:0.391586051426;理财:0.389551862839;工作:0.388852803927;资产:0.387630540481;健康:0.387615260427;冻结:0.386870255234;如果:0.381635810468;防范:0.376002959141;促进:0.376002959141;互联网:0.37359091714;经济:0.372414661979;影响:0.372246364599;明确:0.371144932164;这个:0.366669700253;计划:0.366669700253;今年:0.362866066284;不能:0.361665530802;被查:0.361665530802;租赁:0.361201632796;经侦:0.358563748023;叶子:0.358213199291;制度:0.357709877673;代销:0.357324198204;之前:0.35723653541;了解:0.357153398169;来看:0.356916955951;上海:0.352456233467;美国:0.351113237814;借贷:0.349701473881;百度:0.348537829528;可能:0.347948601982", "day_hot_keywords": 
"中介机构:0.444331081007;征求意见:0.410639456923;细则:0.32659403459;暂行办法:0.323185939615;出借:0.317699616172;活动:0.211808213709;清单:0.211313883595;办法:0.20966181388;备案:0.182314620765;应当:0.175445184133;会同:0.17503951676;明确:0.174617280325;公安部:0.174023198254;金融监管:0.170591933492;负面:0.168777921092;禁止:0.168459646537;义务:0.167560918974;部门:0.154429737787;行为:0.154042337243;网络:0.152746162409;责任编辑:0.149741908033;中介:0.1492348664;披露:0.148921695081;信息化:0.148110360336;保本:0.144652324122;管理:0.14100822943;保息:0.136732512914;事后:0.134645782123;起草:0.133525222266;规定:0.132254971826;评估:0.12906546907;要求:0.128138967708;不得:0.128083612071;加强:0.127507825644;信息:0.125173488687;底线:0.12515939901;信息安全:0.124302284467;有关:0.123225051757;职责:0.121862486526;地方:0.121665300276;承担:0.121665300276;赵然:0.121181203911;意见:0.120656534782;经营:0.119458751105;合规:0.119007303935;原则:0.117281954276;健康:0.1157456173;保护:0.112964444567;从事:0.112956297444;办公室:0.111872056021;数据库:0.111872056021;存管:0.111206457535;公开:0.110031788639;HZ002:0.107716625699;报送:0.106629675711;自负:0.106629675711;现向:0.106629675711;注册地:0.106629675711;孙立欣:0.106629675711;风险管理:0.106629675711;HF017:0.106629675711;制度:0.104274871908;银监会:0.103750758415;责任:0.1001439167;利好:0.0998942338374;引导:0.0994418275739;实行:0.0994418275739;解读:0.0992811300538;借贷:0.0971273471917;实施:0.0962764607769;投融资:0.0962764607769;规范:0.0957610855902;撮合:0.095585299249;征求:0.0942520474863;指导:0.0938729309958;防范:0.0935886925203;内容:0.0935707648708;银行业:0.0924187888179;人民政府:0.0913968648948;电信业务:0.0913968648948;事中:0.0913968648948;成谜:0.0913968648948;空间:0.089016814844;教育:0.0870115991272;允许:0.0870115991272;日为:0.0870115991272;法律法规:0.0870115991272;美股:0.0870115991272;提出:0.0852959667462;定位:0.0852959667462;线下:0.0851376002741;促进:0.0842298232683;规则:0.0834970031342;期限:0.0832451948645;监管:0.0828451643252;有利于:0.0821500345048;基本:0.0821500345048;自律:0.0820709326995;社会:0.081874419262;机构:0.0817216048291", "week_hot_keywords": 
"征求意见:0.512874133053;中介机构:0.463553021821;三农:0.461664191032;细则:0.407558890648;暂行办法:0.327170601297;出借:0.321359239609;责任编辑:0.283931524078;退赔:0.27267824813;活动:0.262908148672;办法:0.248871597435;行为:0.243433470993;清单:0.23908620864;坚决:0.236462146626;资本:0.233570975199;宜人贷:0.22658333268;明确:0.224772072286;合规:0.223410054498;赵然:0.218142598504;e租宝:0.213626983439;负面:0.210892000993;管理:0.210667177524;开展:0.206735790424;大大:0.204967565038;经营:0.203641199889;部门:0.202840306082;公安部:0.201335754644;评估:0.200849524756;披露:0.200050329402;非法:0.199784264261;网络:0.198403696208;损失:0.197825066925;加强:0.197631971286;公告:0.196911064104;备案:0.193566798005;规范:0.192900809041;健康:0.189971300497;应当:0.189411109154;社会:0.189411109154;打击:0.186991424786;信息:0.185127945376;底线:0.185009437955;上市:0.184229571714;中介:0.183148712642;事件:0.18172577884;集资:0.181557557404;禁止:0.179823298583;HZ002:0.177240861284;会同:0.177240861284;要求:0.176848833409;金融监管:0.172604265295;地方:0.170358914447;防范:0.170358914447;风险管理:0.169725935399;义务:0.169725935399;责任:0.168901533304;意见:0.166435745022;存管:0.16571280236;某宝:0.163606948878;来看:0.161856872938;促进:0.160894530311;公开:0.158832415107;保本:0.157641431084;转型:0.156842391644;有关:0.155826187322;法院:0.155826187322;速报:0.154296304908;早报:0.154296304908;合法权益:0.154296304908;内容:0.153896526188;借贷:0.151923680813;办公室:0.151001815983;信息安全:0.151001815983;信息化:0.149973036471;规定:0.148434969746;相关:0.147906275946;互联网:0.146949980475;线下:0.146780275784;暂停:0.146075804864;制定:0.146060082596;纽交所:0.145437774834;发布:0.143339514423;利好:0.14303725871;依法:0.142058331866;受害人:0.141965762039;自己:0.141965762039;用户:0.141648856572;基金:0.140650760074;解决:0.140240965552;孙立欣:0.138866674418;引导:0.138418331318;保息:0.138418331318;美股:0.138418331318;来说:0.13748311046;事后:0.136339124065;强调:0.136339124065;起草:0.135121226643;总部:0.135121226643;海外:0.135049362345;资本市场:0.135033972346;或者:0.134880727449"} 2 | -------------------------------------------------------------------------------- /static/data/hot_topic/1/hot.json: -------------------------------------------------------------------------------- 1 | {"2015.12.05": 1, "2015.12.14": 4, "2015.12.23": 2, "2015.12.22": 1, "2015.12.21": 1, "2015.12.06": 1, "2015.12.10": 6, "2015.12.12": 2, "2015.12.03": 3, "2015.12.11": 2, "2015.12.16": 6, "2015.12.04": 8, "2015.12.07": 2, "2015.12.15": 1, "2015.12.09": 8, "2015.12.08": 9, "2015.12.18": 3, "2015.11.20": 1, "2015.12.17": 5} 2 | -------------------------------------------------------------------------------- /static/data/hot_topic/1/keywords.json: -------------------------------------------------------------------------------- 1 | {"带走": 0.23684967683127037, "分公司": 0.25956039027647254, "法定代表": 0.14870965406099504, "经侦": 0.18214517223841473, "e租宝": 1.1697296520291622, "大大": 0.372192349306282, "大厦": 0.16110212523274461, "调查": 0.3517286084603871, "违法": 0.14683508945988097, "官网": 0.19885367608039275, "支公司": 0.1296077773106976, "维权": 0.13216624658200335, "租赁": 0.1459274930542536, "冻结": 0.1858573816813141, "金易": 0.1845592914987935, "最新消息": 0.1458087494745348, "突查": 0.14501087189190917, "变更": 0.13182806535628105, "被查": 0.32707812514794476, "新华社": 0.14296721667793394, "工资": 0.1296077773106976, "北京": 0.16330256821375114, "张敏": 0.14501087189190917, "钰诚集团": 0.3163873568550746, "集团": 0.22058417824533577, "爆料": 0.17669159156054834, "今日": 0.14111584709219382, "申彤": 0.1944116659660464, "返回": 0.14296721667793394, "警察": 0.15726393834572733, "钰诚": 0.21061263812988357, "缅甸": 0.17156066001352072, "关联": 0.13124027760956694, "全部": 0.21429256475210176, "朝阳区": 0.14296721667793394, "暂停": 0.13426221997341273, "母公司": 
0.2109249045700497, "警方": 0.16934238164432705, "遭查": 0.15726393834572733, "网络科技": 0.13631718288924546, "安徽": 0.13214033179644719, "代销": 0.2517172962169388, "查封": 0.1858870675762438, "之前": 0.162009721638372, "员工": 0.2677694384163143, "官方": 0.15381356460828002, "事件": 0.16455403752981024, "消息": 0.18501833500736867, "报道": 0.16787821200254346, "涉嫌": 0.25407694046566315} 2 | -------------------------------------------------------------------------------- /static/data/hot_topic/2/hot.json: -------------------------------------------------------------------------------- 1 | {"2015.12.29": 17, "2015.12.28": 36, "2015.11.16": 1, "2015.12.10": 1, "2015.12.30": 12, "2015.12.31": 3, "2015.12.14": 2, "2015.12.09": 1, "2015.12.08": 3} 2 | -------------------------------------------------------------------------------- /static/data/hot_topic/2/keywords.json: -------------------------------------------------------------------------------- 1 | {"要求": 0.2168430257033911, "信息": 0.17613858265776722, "经营": 0.23127291875921085, "办法": 0.5034924397242753, "红线": 0.2249833330449719, "网络": 0.27740940580158757, "解读": 0.1931281772409613, "暂行办法": 0.4476186986200558, "法律法规": 0.19774209803442158, "防范": 0.17775340094386952, "清单": 0.4476186986200558, "备案": 0.29997777739329584, "征求意见": 0.6297190964058518, "禁止": 0.3655979161980793, "金融监管": 0.3365070199890776, "管理": 0.2456868918341056, "有关": 0.2496664987015737, "地方": 0.25956039027647254, "报送": 0.1944116659660464, "意见": 0.19022984141392865, "信息化": 0.20015410334910752, "责任编辑": 0.19226695576035002, "义务": 0.2916174989490696, "存管": 0.19979699775334406, "保息": 0.2974193081219901, "会同": 0.2287475466846943, "明确": 0.319084603072442, "细则": 0.45070232635366, "公安部": 0.22306448109149257, "部门": 0.25507003024903296, "规定": 0.2689219817098808, "中介": 0.2177122623518977, "自担": 0.1782106938022092, "中介机构": 0.693978385617977, "活动": 0.30613085718110866, "行为": 0.2666301014158043, "合规": 0.18398674999237677, "保护": 0.17901629329788363, "整改": 0.24784942343499175, "应当": 0.33835668118752904, "起草": 0.17669159156054834, "资管": 0.1874861108708099, "披露": 0.19174454002591265, "不得": 0.3244741635681649, "出借": 0.5530105807142608, "众筹": 0.17400377997463243, "保本": 0.2355887887473978, "承担": 0.18265360797233252, "负面": 0.37609171357450355, "评估": 0.1782997514869983} 2 | -------------------------------------------------------------------------------- /static/data/hot_topic/3/hot.json: -------------------------------------------------------------------------------- 1 | {"2015.12.29": 1, "2015.12.28": 3, "2015.12.23": 1, "2015.12.22": 2, "2015.12.21": 5, "2015.12.20": 3, "2015.11.30": 1, "2015.11.17": 5, "2015.12.11": 1, "2015.12.16": 2, "2015.11.19": 1, "2015.12.07": 1, "2015.11.21": 1, "2015.11.24": 3, "2015.12.19": 6, "2015.12.18": 5, "2015.11.20": 1, "2015.11.18": 3} 2 | -------------------------------------------------------------------------------- /static/data/hot_topic/3/keywords.json: -------------------------------------------------------------------------------- 1 | {"股价": 0.13734352287459164, "净利润": 0.16110212523274461, "美股": 0.1845592914987935, "三季度": 0.10007705167455376, "每股": 0.11864525882065295, "递交": 0.12867049501014052, "万美元": 0.22713191449771541, "权则": 0.1134068051468604, "说明书": 0.12867049501014052, "LendingClub": 0.2238093493100279, "收为": 0.1296077773106976, "路演": 0.14501087189190917, "华兴": 0.162009721638372, "上市": 0.3784106006394457, "募资": 0.12867049501014052, "首日": 0.2372905176413059, "新股": 0.10007705167455376, "申请": 0.13216624658200335, "成功": 0.1968604164143504, "第一股": 0.3258812350637608, "国内": 
0.09764199509110238, "海外": 0.11940571677031786, "承销商": 0.1296077773106976, "唐宁": 0.26428066359289437, "招股书": 0.2144508250169009, "宜信": 0.2699482995658228, "速报": 0.1296077773106976, "认购": 0.11437377334234715, "ADS": 0.1458087494745348, "发行价": 0.24784942343499175, "PPT": 0.17156066001352072, "YRD": 0.15726393834572733, "破发": 0.2109249045700497, "赴美": 0.2827065464968774, "美国": 0.25407694046566315, "提交": 0.14501087189190917, "资本市场": 0.12734221152487574, "净营收": 0.1134068051468604, "最新版": 0.10546245228502485, "百度": 0.17393375190474966, "美元": 0.3072984370060266, "IPO": 0.2835928328717615, "更新": 0.13182806535628105, "申请书": 0.10007705167455376, "净亏损": 0.1296077773106976, "招股": 0.1845592914987935, "区间": 0.10546245228502485, "登陆": 0.2605215638625117, "纽交所": 0.5101880625640854, "宜人贷": 0.7784574245709521} 2 | -------------------------------------------------------------------------------- /static/data/hot_topic/4/hot.json: -------------------------------------------------------------------------------- 1 | {"2015.12.22": 1, "2015.11.23": 4, "2015.12.02": 1, "2015.11.19": 4, "2015.11.26": 2, "2015.11.27": 1, "2015.11.24": 1, "2015.11.30": 2, "2015.11.20": 2, "2015.11.18": 2} 2 | -------------------------------------------------------------------------------- /static/data/hot_topic/4/keywords.json: -------------------------------------------------------------------------------- 1 | {"背书": 0.09439398608135204, "抢占": 0.049569884686998356, "控股": 0.058001259991544145, "果子": 0.0648038886553488, "黑天鹅": 0.0486029164915116, "联想": 0.10546245228502485, "e租宝": 0.09160533419505487, "电视广告": 0.0486029164915116, "新闻联播": 0.13631718288924546, "打广告": 0.11437377334234715, "招标会": 0.10007705167455376, "王思聪": 0.14870965406099504, "黄金": 0.1153704274327636, "电视": 0.0972058329830232, "金信网": 0.049569884686998356, "紫马财行": 0.0648038886553488, "标王": 0.24784942343499175, "广告主": 0.07148360833896697, "刘珺": 0.0486029164915116, "中赢": 0.0486029164915116, "翼龙贷": 0.2906034718497554, "拿下": 0.08578033000676036, "泛亚": 0.049569884686998356, "速报": 0.0648038886553488, "夺得": 0.081004860819186, "越描越黑": 0.0486029164915116, "熬过": 0.0486029164915116, "投放": 0.0648038886553488, "大干": 0.0486029164915116, "烧钱": 0.0541355310582768, "金银猫": 0.052731226142512425, "央视": 0.33477483700681704, "标版": 0.0486029164915116, "招标": 0.10007705167455376, "时段": 0.0972058329830232, "银谷": 0.11437377334234715, "寒冬": 0.05925113364795651, "标的物": 0.052731226142512425, "做广告": 0.081004860819186, "没好": 0.081004860819186, "资源": 0.08131712725935213, "媒体": 0.06562013880478347, "花费": 0.0486029164915116, "广告位": 0.1134068051468604, "亿成": 0.0486029164915116, "重金": 0.11864525882065295, "广告": 0.23378674087063733, "财经频道": 0.0486029164915116, "广告费用": 0.0648038886553488, "郭大刚": 0.07148360833896697} 2 | -------------------------------------------------------------------------------- /static/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /static/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- 
/static/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /static/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /static/img/bg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/img/bg.jpg -------------------------------------------------------------------------------- /static/img/detail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/img/detail.png -------------------------------------------------------------------------------- /static/img/dl.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/img/dl.jpg -------------------------------------------------------------------------------- /static/img/hot_topic_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/img/hot_topic_1.jpg -------------------------------------------------------------------------------- /static/img/hot_topic_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/img/hot_topic_2.jpg -------------------------------------------------------------------------------- /static/img/hot_topic_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/img/hot_topic_3.jpg -------------------------------------------------------------------------------- /static/img/hot_topic_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/img/hot_topic_4.jpg -------------------------------------------------------------------------------- /static/img/mh3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/img/mh3.jpg -------------------------------------------------------------------------------- /static/img/not_found.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindawei/p2p/b0771100b8d4740906a186d9668b7d662214409c/static/img/not_found.jpg -------------------------------------------------------------------------------- /static/js/jquery.cookie.js: -------------------------------------------------------------------------------- 1 | /*! 
2 | * jQuery Cookie Plugin v1.4.1 3 | * https://github.com/carhartl/jquery-cookie 4 | * 5 | * Copyright 2013 Klaus Hartl 6 | * Released under the MIT license 7 | */ 8 | (function (factory) { 9 | if (typeof define === 'function' && define.amd) { 10 | // AMD 11 | define(['jquery'], factory); 12 | } else if (typeof exports === 'object') { 13 | // CommonJS 14 | factory(require('jquery')); 15 | } else { 16 | // Browser globals 17 | factory(jQuery); 18 | } 19 | }(function ($) { 20 | 21 | var pluses = /\+/g; 22 | 23 | function encode(s) { 24 | return config.raw ? s : encodeURIComponent(s); 25 | } 26 | 27 | function decode(s) { 28 | return config.raw ? s : decodeURIComponent(s); 29 | } 30 | 31 | function stringifyCookieValue(value) { 32 | return encode(config.json ? JSON.stringify(value) : String(value)); 33 | } 34 | 35 | function parseCookieValue(s) { 36 | if (s.indexOf('"') === 0) { 37 | // This is a quoted cookie as according to RFC2068, unescape... 38 | s = s.slice(1, -1).replace(/\\"/g, '"').replace(/\\\\/g, '\\'); 39 | } 40 | 41 | try { 42 | // Replace server-side written pluses with spaces. 43 | // If we can't decode the cookie, ignore it, it's unusable. 44 | // If we can't parse the cookie, ignore it, it's unusable. 45 | s = decodeURIComponent(s.replace(pluses, ' ')); 46 | return config.json ? JSON.parse(s) : s; 47 | } catch(e) {} 48 | } 49 | 50 | function read(s, converter) { 51 | var value = config.raw ? s : parseCookieValue(s); 52 | return $.isFunction(converter) ? converter(value) : value; 53 | } 54 | 55 | var config = $.cookie = function (key, value, options) { 56 | 57 | // Write 58 | 59 | if (value !== undefined && !$.isFunction(value)) { 60 | options = $.extend({}, config.defaults, options); 61 | 62 | if (typeof options.expires === 'number') { 63 | var days = options.expires, t = options.expires = new Date(); 64 | t.setTime(+t + days * 864e+5); 65 | } 66 | 67 | return (document.cookie = [ 68 | encode(key), '=', stringifyCookieValue(value), 69 | options.expires ? '; expires=' + options.expires.toUTCString() : '', // use expires attribute, max-age is not supported by IE 70 | options.path ? '; path=' + options.path : '', 71 | options.domain ? '; domain=' + options.domain : '', 72 | options.secure ? '; secure' : '' 73 | ].join('')); 74 | } 75 | 76 | // Read 77 | 78 | var result = key ? undefined : {}; 79 | 80 | // To prevent the for loop in the first place assign an empty array 81 | // in case there are no cookies at all. Also prevents odd result when 82 | // calling $.cookie(). 83 | var cookies = document.cookie ? document.cookie.split('; ') : []; 84 | 85 | for (var i = 0, l = cookies.length; i < l; i++) { 86 | var parts = cookies[i].split('='); 87 | var name = decode(parts.shift()); 88 | var cookie = parts.join('='); 89 | 90 | if (key && key === name) { 91 | // If second argument (value) is a function it's a converter... 92 | result = read(cookie, value); 93 | break; 94 | } 95 | 96 | // Prevent storing a cookie that we couldn't decode. 97 | if (!key && (cookie = read(cookie)) !== undefined) { 98 | result[name] = cookie; 99 | } 100 | } 101 | 102 | return result; 103 | }; 104 | 105 | config.defaults = {}; 106 | 107 | $.removeCookie = function (key, options) { 108 | if ($.cookie(key) === undefined) { 109 | return false; 110 | } 111 | 112 | // Must not alter options, thus extending a fresh object... 
113 | $.cookie(key, '', $.extend({}, options, { expires: -1 })); 114 | return !$.cookie(key); 115 | }; 116 | 117 | })); 118 | -------------------------------------------------------------------------------- /static/js/p2p/layout.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2016/4/19. 3 | */ 4 | document.onkeydown = function (event) { 5 | var e = event || window.event || arguments.callee.caller.arguments[0]; 6 | if (e && e.keyCode == 13) { // enter 键 7 | $("#nav_search_btn").click(); 8 | } 9 | }; 10 | 11 | /** 显示错误 */ 12 | function my_alert(error_str) { 13 | $('#myModal').modal('show') 14 | $('#modal-alert').html(error_str); 15 | } 16 | 17 | 18 | $(document).ready(function () { 19 | $("#nav_search_btn").click(function () { 20 | var key_word = $("#search_key")[0].value; 21 | if (key_word.length == 0) 22 | my_alert("平台名称不能为空!") 23 | else 24 | window.location.href = "/search/" + key_word; 25 | }); 26 | 27 | if ($.cookie('username') == null) { // 未登录 28 | $('#sign_in_out').text('登录'); 29 | $('#register').removeClass('hidden') 30 | $('#grzx').addClass('hidden') 31 | $('#sign_in_out').click(function () { 32 | window.location.href = "/sign_in"; 33 | }) 34 | } else { // 已登录 35 | $('#sign_in_out').text('退出'); 36 | $('#register').addClass('hidden') 37 | $('#grzx').removeClass('hidden') 38 | $('#sign_in_out').click(function () { 39 | $.removeCookie('username'); 40 | window.location.href = "/home"; 41 | }) 42 | } 43 | }); 44 | 45 | 46 | -------------------------------------------------------------------------------- /static/js/p2p/qwzx_hot_topic.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2016/4/19. 3 | */ 4 | // 根据返回信息添加元素 5 | function append_info(topic_id, items) { 6 | for (var i = 0; i < items.length; ++i) { 7 | $('#show').append( 8 | "" + 9 | "
" + 10 | "
" + items[i].title + "
" + 11 | "
" + items[i].item_pub_time + "
" + 12 | "
" + 13 | "
" 14 | ); 15 | } 16 | } 17 | 18 | // 显示热点词汇 19 | function createRandomItemStyle() { 20 | var base = 225; 21 | return { 22 | normal: { 23 | color: 'rgb(' + [ 24 | Math.round(Math.random() * base), 25 | Math.round(Math.random() * base), 26 | Math.round(Math.random() * base) 27 | ].join(',') + ')' 28 | } 29 | }; 30 | } 31 | 32 | 33 | function show_hot_word(word_data) { 34 | 35 | $("#hot_content").html(""); 36 | var size = 50; 37 | var show_data = []; 38 | for (var i = 0; i < word_data.length; ++i) { 39 | var word = word_data[i]; 40 | if (size > 40) 41 | size -= 4; 42 | else if (size > 24) 43 | size -= 2; 44 | else if (size > 8) 45 | size -= 1; 46 | 47 | var item = {}; 48 | item['name'] = word['name']; 49 | item['value'] = size; 50 | item['itemStyle'] = createRandomItemStyle(); 51 | show_data.push(item); 52 | } 53 | var cy_chart = echarts.init(document.getElementById('hot_content')); 54 | option = { 55 | series: [{ 56 | type: 'wordCloud', 57 | size: ['100%', '100%'], 58 | textRotation: [0, 45, -45, 90], 59 | textPadding: 1, 60 | autoSize: { 61 | enable: true, 62 | minSize: 40 63 | }, 64 | data: show_data 65 | }] 66 | }; 67 | cy_chart.setOption(option); 68 | } 69 | 70 | // 显示热点趋势 71 | function show_hot_trend(hot_map) { 72 | div_object = $("#hot_trend"); 73 | div_object.height(Math.round(div_object.width() * 0.45)) 74 | var hot_trend_chart = echarts.init(document.getElementById('hot_trend')); 75 | hot_trend_option = { 76 | tooltip: { 77 | trigger: 'axis' 78 | }, 79 | grid: { 80 | left: '3%', 81 | right: '4%', 82 | bottom: '3%', 83 | containLabel: true 84 | }, 85 | xAxis: [ 86 | { 87 | type: 'category', 88 | boundaryGap: false, 89 | data: hot_map.x 90 | } 91 | ], 92 | yAxis: [ 93 | { 94 | name: '热度值', 95 | type: 'value' 96 | } 97 | ], 98 | series: [ 99 | { 100 | name: '热度', 101 | type: 'line', 102 | label: { 103 | normal: { 104 | show: true, 105 | position: 'top' 106 | } 107 | }, 108 | areaStyle: {normal: {}}, 109 | data: hot_map.y 110 | } 111 | ] 112 | }; 113 | hot_trend_chart.setOption(hot_trend_option); 114 | } 115 | 116 | // 加载数据 117 | $(document).ready(function () { 118 | 119 | var topic_id = $("#topic_id").text(); 120 | if (topic_id == 1) { 121 | $("#title").text("e租宝涉嫌违法经营分崩离析"); 122 | } else if (topic_id == 2) { 123 | $("#title").text("P2P监管办法征求意见稿发布"); 124 | 125 | } else if (topic_id == 3) { 126 | $("#title").text("宜人贷上市"); 127 | 128 | } else if (topic_id == 4) { 129 | $("#title").text("翼龙贷3.7亿豪夺央视标王"); 130 | } 131 | 132 | $.getJSON("/info/hot/topic/preview/" + topic_id, function (data) { 133 | show_hot_word(data.keyword_list); 134 | append_info(topic_id, data.item_list); 135 | show_hot_trend(data.hot_map) 136 | 137 | }); 138 | 139 | }); -------------------------------------------------------------------------------- /static/js/p2p/qwzx_type.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2016/4/19. 
3 | */ 4 | 5 | var rowCount = 40; 6 | var current = 0; 7 | var type; 8 | 9 | // 加载数据 10 | $(document).ready(function () { 11 | 12 | type = $("#type").text(); 13 | // 重命名 14 | if (type == 'news') 15 | $("#type").text("新闻") 16 | else if (type == 'policy') 17 | $("#type").text("政策") 18 | else if (type == 'opinion') 19 | $("#type").text("观点") 20 | else if (type == 'ugc') 21 | $("#type").text("用户评论") 22 | 23 | // 导航栏 24 | $.getJSON("/info/" + type + "/list/size", function (data) { 25 | $('#num_info').text('每页最多 ' + rowCount + '条,共 ' + data.list_size + '条') 26 | // 计算页数 27 | var pages = Math.floor(data.list_size / rowCount); 28 | if ((data.list_size % rowCount) > 0) 29 | pages += 1; 30 | // 添加导航栏 31 | for (var i = 0; i < pages; ++i) { 32 | if (i == 0) 33 | $('#pages').append("
  • " + "" + (i + 1) + "
  • ") 34 | else 35 | $('#pages').append("
  • " + (i + 1) + "
  • ") 36 | } 37 | // 添加点击事件 38 | $(".mynav").click(reget_list); 39 | }); 40 | 41 | // 显示 42 | $.getJSON("/info/" + type + "/list/current=" + current + "&rowCount=" + rowCount, function (data) { 43 | show_list(data); 44 | }); 45 | }); 46 | 47 | // 重新刷新数据 48 | function reget_list() { 49 | var url = "/info/" + type + "/list/current=" + ($(this).text() - 1) + "&rowCount=" + rowCount; 50 | $.getJSON(url, function (data) { 51 | show_list(data); 52 | }); 53 | $(".mynav").attr("class", "mynav"); 54 | $(this).attr("class", "mynav active"); 55 | 56 | } 57 | 58 | // 显示数据 59 | function show_list(data) { 60 | $('#show').html(""); 61 | for (var i = 0; i < data.type_list.length; ++i) { 62 | var item = data.type_list[i]; 63 | if (type == 'ugc') { 64 | $('#show').append( 65 | "" + 66 | "
    " + 67 | "
    " + 68 | "Q:   " + item.title + "
    " + 69 | "
    " + 70 | "
    " + 71 | "
    " + 72 | "A:   " + item.content + "
    " + 73 | "
    " + item.author + "
    " + 74 | "
    " + item.item_pub_time + "
    " + 75 | "
    " + 76 | "
    " 77 | ); 78 | } else { 79 | var title = item.title; 80 | if (title.length == 0) { 81 | var content = item.content.substr(0, 40) 82 | title = content; 83 | } 84 | var head_str = ' '; 85 | if (type == 'news') { 86 | head_str = ' '; 87 | } else if (type == 'policy') { 88 | head_str = ' '; 89 | } else if (type == 'opinion') { 90 | head_str = ' '; 91 | } 92 | var tags_str = ""; 93 | var tags = item.tags; 94 | if (tags.length > 0) { 95 | var tags_list = tags.split(','); 96 | for (var j = 0; j < tags_list.length; ++j) { 97 | tag = tags_list[j] 98 | tags_str += head_str + tag + ''; 99 | } 100 | } 101 | 102 | $('#show').append( 103 | "" + 104 | "
    " + 105 | "
    " + title + tags_str + "
    " + 106 | "
    " + item.author + "
    " + 107 | "
    " + item.item_pub_time + "
    " + 108 | "
    " + 109 | "
    " 110 | ); 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /static/js/p2p/qwzx_type_detail.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2016/4/19. 3 | */ 4 | 5 | $(document).ready(function () { 6 | 7 | // 处理文本中的换行和空格 8 | var content = $("#content").text(); 9 | content = content.replace(new RegExp("#n#", "gm"), "
    "); 10 | content = content.replace(new RegExp("#r#", "gm"), ""); 11 | content = content.replace(new RegExp(" ", "gm"), " ") 12 | $("#content").html(content); 13 | 14 | if ($("#title").text().length == 0) 15 | $("#title").html(content.substr(0, 20) + "...") 16 | 17 | type = $("#type").text(); 18 | if (type == 'news') 19 | $("#type").text("新闻") 20 | else if (type == 'policy') 21 | $("#type").text("政策") 22 | else if (type == 'opinion') 23 | $("#type").text("观点") 24 | else if (type == 'ugc') 25 | $("#type").text("用户评论") 26 | }); 27 | -------------------------------------------------------------------------------- /templates/detail_problem.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block title %} 问题平台档案 {% endblock %} 4 | 5 | {% block body %} 6 |
    7 |
    8 | 9 |
    10 |

    0条记录

    11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
    编号平台名称上线时间问题时间地区注册资本问题类型
    26 |
    27 | 28 |
    29 |
    30 | 31 | {% endblock %} -------------------------------------------------------------------------------- /templates/detail_problem_analyze.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block title %} 问题平台分析 {% endblock %} 4 | 5 | {% block body %} 6 | 7 |
    8 |
    9 | 10 |
    11 |
    12 |
    13 | 14 |
    15 |
    16 |
    17 | 18 |
    19 |
    20 |
    21 | 22 |
    23 |
    24 |
    25 | 26 |
    27 |
    28 |
    29 | 30 |
    31 |
    32 | 33 | 34 | 35 | 36 | {% endblock %} -------------------------------------------------------------------------------- /templates/detail_rank.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block title %} 平台档案 {% endblock %} 4 | 5 | {% block body %} 6 | 7 |
    8 |
    9 | 10 |
    11 |

    热门平台 Top0    排序:  

    13 |
    14 | 15 |
    16 | 17 | 18 | 19 | 20 | 21 | 22 | 35 | 48 | 61 | 73 | 74 | 75 | 76 | 77 | 78 | 79 |
    编号平台名称 23 | 34 | 36 | 47 | 49 | 60 | 62 | 72 | 所在地区平台详情
    80 |
    81 | 82 |
    83 | 84 | 85 |   用户观点向上轮播 86 | 87 |
    88 | 89 | 96 | 97 |
    99 | 100 |
    101 | 102 |
    103 |
    104 | 105 | 106 | 107 | 108 | {% endblock %} -------------------------------------------------------------------------------- /templates/grzx.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block title %} 个人中心 {% endblock %} 4 | 5 | {% block body %} 6 |
    7 |
    8 |
    9 | 12 |
    13 | 14 | 15 | 24 | 25 |
    26 |
    27 | 28 | 29 | 30 | 31 |
    32 |
    33 |
    34 |
    35 | 36 |
    37 |
    38 | 39 | 40 | 41 | 42 | {% endblock %} -------------------------------------------------------------------------------- /templates/home.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block title %}首页 {% endblock %} 4 | 5 | 6 | {% block body %} 7 | 8 |
    9 | 10 |
    11 |

    昆仑镜
    昆仑镜为您提供全面的网贷行业资讯,为您的决策提供支持。
{% endblock %}
-------------------------------------------------------------------------------- /templates/info_hot_topic.html: --------------------------------------------------------------------------------
{% extends "layout.html" %}

{% block title %} 热点话题 {% endblock %}

{% block body %}
{% endblock %}
-------------------------------------------------------------------------------- /templates/info_hot_topic_news_detail.html: --------------------------------------------------------------------------------
{% extends "layout.html" %}

{% block title %} 热点新闻详情 {% endblock %}

{% block body %}
    {{ data_info.title }}
    日期:{{ data_info.item_pub_time }}
    {{ data_info.concent }}
{% endblock %}
-------------------------------------------------------------------------------- /templates/info_type.html: --------------------------------------------------------------------------------
{% extends "layout.html" %}

{% block title %} 资讯类型 {% endblock %}

{% block body %}
{% endblock %}
-------------------------------------------------------------------------------- /templates/info_type_detail.html: --------------------------------------------------------------------------------
{% extends "layout.html" %}

{% block title %} 资讯详情 {% endblock %}

{% block body %}
    {{ data_info.title }}
    {{ data_info.item_pub_time }}    {{ data_info.author }}    {{ data_info.url }}
    {{ data_info.content }}
{% endblock %}
-------------------------------------------------------------------------------- /templates/layout.html: --------------------------------------------------------------------------------
{% block title %} 昆仑镜 {% endblock %}

{% block head %}
{% endblock %}

{% block body %}
{% endblock %}

{% block footer %}
{% endblock %}
-------------------------------------------------------------------------------- /templates/register.html: --------------------------------------------------------------------------------
{% extends "layout.html" %}

{% block title %} 注册 {% endblock %}

{% block body %}
{% endblock %}
-------------------------------------------------------------------------------- /templates/search_detail_info.html: --------------------------------------------------------------------------------
{% extends "detail_info.html" %}

{% block title %} 搜索结果 {% endblock %}
-------------------------------------------------------------------------------- /templates/search_info.html: --------------------------------------------------------------------------------
{% extends "layout.html" %}

{% block title %} 图谱搜索 {% endblock %}

{% block body %}
    {{ data_info.key_word }}

    描述部分(示例):此前,在Facebook上搜索好友是一件痛苦的差事。尽管Facebook提供了搜索栏,但用户很难根据某些条件迅速找到好友。不过,扎克伯格正在测试Facebook将于2013年1月15日发布的一项新功能。这一功能将改变Facebook的用户体验,对竞争对手形成威胁,并可能引起隐私保护组织的抗议。对扎克伯格而言,他需要的搜索条件是“居住在Palo Alto附近的普里西拉和我的朋友”。扎克伯格表示:“我们邀请到5个人,他们都喜欢宠物犬。”

    Graph Search诞生之前业内观察家一直关注Facebook能否释放网站搜索栏的潜力。谷歌对此尤为关注,因为Facebook的搜索服务能获得谷歌搜索引擎无法获取的大量数据,成为谷歌的重要竞争对手。他们也关注Facebook的搜索产品如何运作。一切都水落石出。Facebook新推出的社交图谱搜索Graph Search与传统互联网搜索有着根本的不同。谷歌搜索引擎能检索全球的大量信息,帮助用户寻找问题答案。与此不同,Facebook的搜索服务利用该网站庞大的数据库,帮助用户更好地利用“社交图谱”。根据扎克伯格的描述,社交图谱包括用户与好友和熟人的关系,以及他们喜欢的明星和品牌。

    在Graph Search发布数周前,Facebook高管仍在研究如何命名这一搜索服务。他们希望避免使用“搜索”一词,从而使该服务区别于传统的互联网搜索。例如,在发布的几天前,Facebook一名高管还以“浏览”来称呼该服务。然而在经过几小时讨论后,他们做出了妥协:没有比Graph Search更好的名字。扎克伯格表示:“这清楚地阐明这是一款搜索服务。而社交图谱是一件大事。”

    Graph Search的理念在于,以谷歌搜索引擎发掘互联网信息的方式,发掘Facebook网站社交图谱的含义。扎克伯格表示:“人们使用搜索引擎去回答问题,而我们可以回答其他人无法回答的许多问题。其他搜索服务主要编目了公开信息,而Facebook的信息并不在此列。这是人们分享的内容。此前没有一种很好的方式去了解人们分享了哪些信息,以满足人类发现信息以及寻找他人的需求。我们可以在这一方面有所作为。我们也是全球唯一有能力做到这一点的公司。”结果令人兴奋。这一具有变革意义的产品能帮助用户做许多事,而这些事是用户无法自己去做的。凭借Graph Search,人们可以以全新方式去使用Facebook,例如搜索日期、查找招聘信息、寻找一同出游的伙伴,或检索餐厅等商户。更重要的是,Graph Search扩展了Facebook的核心使命:不仅帮助用户与已知的他人建立联系,还能成为一款发现工具。

    扎克伯格表示,Graph Search帮助Facebook找回了自己的“根”。他表示:“在建立Facebook之初,我们提供了类似的功能,但仅仅覆盖了你的同学。随后,Facebook一方面关注如何帮你认识周围的新朋友,探索你的社区,一方面也帮助你与已认识的人保持联系。但对于几千人的团队来说,同时关注这两方面存在困难。因此,我们的关注重点从帮助你找到想要的人转向了与已认识的人保持联系。Graph Search是一个升级版的发现工具。探索你的社区是人类的核心需求,而这是我们向这一方向迈出的重要一步。”这只是许多步中的第一步。Graph Search将根据用户使用该服务的方式持续改进,因此Facebook并不急于全面推出该服务。在发布之初,Graph Search仅面向一小部分用户开放。扎克伯格认为,到面向全球上亿用户全面开放时,Graph Search将得到极大的改进。例如扎克伯格认为,Graph Search将帮助用户更方便地确定,在宠物犬的生日派对上应当邀请哪些好友。他表示:“我们目前还没有提供‘谁养狗’的选项。”Graph Search得到了Facebook的全面支持,项目团队共70人左右。
{% endblock %}
-------------------------------------------------------------------------------- /templates/search_not_found.html: --------------------------------------------------------------------------------
{% extends "layout.html" %}

{% block title %}未找到 {% endblock %}

{% block body %}
{% endblock %}
-------------------------------------------------------------------------------- /templates/sign_in.html: --------------------------------------------------------------------------------
{% extends "layout.html" %}

{% block title %} 登录 {% endblock %}

{% block body %}
{% endblock %}
-------------------------------------------------------------------------------- /templates/yqdp.html: --------------------------------------------------------------------------------
{% extends "layout.html" %}

{% block title %} 舆情大盘 {% endblock %}

{% block body %}
{% endblock %}
-------------------------------------------------------------------------------- /test_db.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from pymongo import MongoClient

## Connect to the local MongoDB instance and select the p2p database
conn = MongoClient('localhost', 27017)
db = conn.p2p

# Sample queries against a collection (kept for reference)
#type = 'opinion'
#print(db[type].find_one({'_id': '25754'}))
#print(int(db[type].count()))

def sign_in_valid(userName, password):
    # Look up a user document matching both user name and password
    result = db.user.find_one({'username': userName, 'password': password})
    print(result['platform_name'])


# Dump all user documents, then try a sample sign-in
result = db.user.find()
for r in result:
    print(r)
sign_in_valid('mdw', '123')

# Scratch list test
ls = []
ls.append("mi")
print(ls)
ls.remove("mi")
print(ls)
--------------------------------------------------------------------------------