├── .gitignore ├── README.md ├── _config.yml ├── requirements.txt └── src ├── __init__.py ├── algorithm ├── __init__.py └── cluster │ ├── Kmeans │ ├── __init__.py │ └── k_means_cluster.py │ ├── LDA │ ├── __init__.py │ └── lda_cluster.py │ ├── __init__.py │ └── singlePass │ ├── __init__.py │ ├── singlePassCluster.py │ ├── singlePassCluster_copy.py │ └── singlepassrun.py ├── configure.py ├── data_reader.py ├── dynamic_update_event.py ├── event2mysql.py ├── event_update.sh ├── history_event.py ├── load_event_data.py ├── model └── __init__.py ├── parser ├── __init__.py ├── news_parser │ ├── __init__.py │ ├── dbscan.py │ ├── tonghuashun.py │ └── xueqiu.py ├── requirement.txt └── xueqiu │ ├── README.md │ ├── __init__.py │ ├── discuss_focus_statistics_daily.sh │ ├── discuss_parser │ ├── __init__.py │ ├── discuss_data │ │ ├── __init__.py │ │ ├── discuss.db │ │ └── discuss.db' │ ├── discuss_parser.py │ ├── format_transform.py │ ├── participle │ │ └── __init__.py │ ├── xueqiu_dicsuss_batch.py │ ├── xueqiu_discuss_batch_multi.py │ ├── xueqiu_discuss_csv.py │ ├── xueqiu_discuss_csv_bak.py │ ├── xueqiu_discuss_daily.py │ ├── xueqiu_discuss_daily_bak.py │ ├── xueqiu_discuss_parser.py │ └── xueqiu_discuss_parser_bak.py │ ├── focus_parser │ ├── __init__.py │ ├── xueqiu_focus_statistics.py │ └── 雪球大V关注股票.ipynb │ └── log │ ├── dict_log.log.2019-04-29 │ ├── discuss_stock_filter_daily.log.2019-04-29 │ ├── tokenization_log.log.2019-04-29 │ └── xueqiu_focus_statistic.log.2019-04-29 ├── singlepass_run.py ├── singlepass_test.py └── utils ├── Keywords.py ├── VSM ├── __init__.py ├── tfidf.py └── vector.py ├── __init__.py ├── cluster_test.py ├── corpus_update.py ├── data_process.py ├── dicts.py ├── engine ├── __init__.py ├── data_source.py └── mysql_util.py ├── event_util.py ├── file_util.py ├── keywords_extractor.py ├── log ├── __init__.py ├── log2.py └── log_util.py ├── news.py ├── test.py ├── time_util.py └── tokenization.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | .DS_Store 108 | .idea/ 109 | /src/corpus 110 | .log 111 | /src/parser/xueqiu/discuss_parser/hs_err_pid40588.log 112 | /src/algorithm/cluster/singlePass/c2.txt 113 | /src/data/text.txt 114 | /src/model/ 115 | /src/data/full_text.txt 116 | /src/data/corpus_train.txt 117 | /src/data/text_keyword.txt 118 | /src/data/cluster_keywords_lda.txt 119 | /src/data/cluster_keywords_lsi.txt 120 | /src/data/cluster_result_document.txt 121 | /src/data/cluster_result_keyword.txt 122 | /src/model/tfidf_model/feature.pkl 123 | /src/model/tfidf_model/tfidftransformer.pkl 124 | /src/model/model_300_2_1 125 | /src/data/text_full.txt 126 | /src/data/text_title.txt 127 | /src/data/text_title_cut.txt 128 | /src/data/text_full_full.txt 129 | /src/data/ 130 | /log/ 131 | /src/data_copy/ 132 | /src/log/ 133 | /src/parser/log/ 134 | /requ.txt 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 金融财经类新闻文本主题事件提取 2 | 3 | ## 代码结构 4 | . 
5 | ├── configure.py              # configuration module
6 | ├── data_reader.py            # data loading and preprocessing module
7 | ├── dynamic_update.py         # real-time event update module
8 | ├── history_event.py          # historical event building module
9 | ├── load_history_event.py     # loads the historical event library
10 | ├── algorithm
11 | │   └── cluster               # clustering module
12 | │       ├── Kmeans            # k-means clustering
13 | │       │   └── k_means_cluster.py
14 | │       ├── LDA               # LDA clustering
15 | │       │   └── lda_cluster.py
16 | │       └── singlePass        # singlePass clustering
17 | │           ├── singlePassCluster.py
18 | │           └── singlepassrun.py
19 | │
20 | ├── corpus
21 | ├── data                      # preprocessed data
22 | ├── log                       # logs for tokenization, keyword extraction, etc.
23 | ├── model                     # model artifacts such as clustering results, event results and tf-idf results
24 | │   ├── event_model           # event results
25 | │   └── tfidf_model           # tf-idf results
26 | └── utils
27 |     ├── Keywords.py           # keyword extraction code
28 |     ├── cluster.py
29 |     ├── data_process.py       # data preprocessing
30 |     ├── data_source.py        # data access
31 |     ├── dicts.py              # tokenization dictionaries
32 |     ├── event_util.py         # event classes
33 |     ├── keywords_extractor.py # keyword extraction
34 |     ├── my_util.py            # utility helpers
35 |     ├── mysql_util.py         # SQL helpers
36 |     ├── news.py               # news processing classes
37 |     ├── test.py               # test code
38 |     ├── tfidf.py              # tf-idf model training
39 |     ├── time_util.py          # time utilities
40 |     ├── tokenization.py       # tokenization module
41 |     └── vector.py             # vector space model module
42 | 
43 | # Main pipeline
44 | ## Step 1: Data preparation
45 | - Files involved: data_reader.py
46 | - Read all news published before a specified date from the database, then organize it into two parts.
47 | - Part 1: the title and body of each news item are concatenated, then preprocessed (tokenization, stop-word removal, etc.) and saved as news ID, publish time and tokenized body: [news_id, timestamp, contents]
48 | - Part 2: the news titles are extracted and saved as news ID, publish time and title: [news_id, timestamp, title]
49 | 
50 | ## Step 2: VSM training
51 | - Files involved: /utils/tfidf.py
52 | - Build the TF-IDF vector space model; the training corpus is the Part 1 data saved in Step 1.
53 | - Save the vector space model.
54 | 
55 | ## Step 3: singlePass clustering
56 | - Files involved: singlepass_run.py
57 | - Run singlePass clustering on the Part 1 data generated in Step 1.
58 | 
59 | ## Step 4: Historical event preparation
60 | - Files involved: history_event.py
61 | - Build the event library from the Step 3 clustering results: add event titles, filter the stocks each event involves, extract event keywords, and judge whether each event is valid.
62 | 
63 | ## Step 5: Event update
64 | - Files involved: dynamic_update.py
65 | - When new news appears in the database, merge it into the historical events; if a news item cannot be merged, a new event is created.
66 | 
67 | 
68 | ## Step 6: Writing data to the database
69 | - Files involved: event2mysql.py
70 | - After events are generated, they are formatted as the project requires and stored in the database; currently two tables are kept: [event ID, event title, stocks in the event, news in the event] and [stock, events involving the stock].
71 | 
72 | ## Step 7: Xueqiu discussion history statistics
73 | - Files involved: xueqiu_dicsuss_batch.py, xueqiu_discuss_batch_multi.py
74 | - Count the stocks mentioned in the Xueqiu discussion data.
75 | 
76 | ## Step 8: Data format conversion
77 | - Files involved: format_transform.py
78 | - Convert the statistics from Step 7 into per-stock discussion counts; the stock-code format is also normalized.
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | joblib
2 | tqdm
3 | backports.functools-lru-cache==1.5
4 | boto==2.49.0
5 | boto3==1.9.34
6 | botocore==1.12.34
7 | bz2file==0.98
8 | certifi==2018.10.15
9 | chardet==3.0.4
10 | cycler==0.10.0
11 | docutils==0.14
12 | futures==3.2.0
13 | gensim==3.6.0
14 | idna==2.7
15 | jieba==0.39
16 | jmespath==0.9.3
17 | JPype1==0.6.3
18 | kiwisolver==1.0.1
19 | matplotlib==2.2.3
20 | mysql-connector==2.1.6
21 | numpy==1.15.3
22 | pandas==0.23.4
23 | pyhanlp==0.1.44
24 | pyparsing==2.2.2
25 | python-dateutil==2.7.5
26 | pytz==2018.7
27 | requests==2.20.0
28 | s3transfer==0.1.13
29 | scikit-learn==0.20.0
30 | scipy==1.1.0
31 | six==1.11.0
32 | sklearn==0.0
33 | smart-open==1.7.1
34 | SQLAlchemy==1.2.12
35 | subprocess32==3.5.3
36 | urllib3==1.26.5
37 | 
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | #!
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/10/30 10:44 AM 9 | """ 10 | import sys 11 | sys.path.append('../') 12 | sys.path.append('..') 13 | sys.path.append('../../') 14 | # import pandas as pd 15 | # df = pd.DataFrame({"A":['a','a','b','c','d'],"B":[4,5,6,7,8]}).set_index("A") 16 | # # print df 17 | # kk = ['a', 'b'] 18 | # hj = [] 19 | # for j in kk: 20 | # tmp_res = (df.loc[j].values.tolist()) 21 | # if len(tmp_res) > 1: 22 | # for k in range(len(tmp_res)): 23 | # hj.extend(tmp_res[k]) 24 | # else: 25 | # hj.extend(tmp_res) 26 | # print hj 27 | 28 | 29 | # from collections import Counter 30 | # 31 | # stock_lists = ['a','b','b','b','b','c','c','c','d','d','d','d'] 32 | # 33 | # stock_lists_dict = Counter(stock_lists).items() 34 | # stock_lists_dict.sort(key=lambda item: item[1], reverse=True) 35 | # 36 | # stock_list = [] 37 | # for i in stock_lists_dict: 38 | # stock_list.append(i[0]) 39 | # 40 | # print stock_list 41 | 42 | 43 | # import pandas as pd 44 | # # from itertools import groupby #itertool还包含有其他很多函数,比如将多个list联合起来。。 45 | # # 46 | # df = pd.DataFrame({'event_id': [1, 2, 3,4, 5], 47 | # 'event_stock': [['i1','i2'], ['i3', 'i2'], ['i3', 'i5'], ['i9', 'i7'], ['i9']]}) 48 | # print df 49 | # 50 | # lst = {} 51 | # 52 | # for i in range(len(df)): 53 | # event_id = df.loc[i]['event_id'] 54 | # event_stock = df.loc[i]['event_stock'] 55 | # if len(event_stock) > 0: 56 | # for symbol in event_stock: 57 | # lst.setdefault(symbol, []).append(event_id) 58 | # 59 | # print lst 60 | 61 | # from collections import defaultdict 62 | # # lst = [{'a': 123}, {'a': 456},{'b': 789}] 63 | # 64 | # dic = {} 65 | # for _ in lst: 66 | # for k, v in _.items(): 67 | # dic.setdefault(k, []).append(v) 68 | # 69 | # print dic 70 | 71 | 72 | # from utils import file_util, event_util, time_util 73 | # from configure import conf 74 | # import datetime 75 | # import time 76 | # 77 | # history_event_file = file_util.find_newest_file(conf.event_save_path) 78 | # 79 | # print history_event_file 80 | -------------------------------------------------------------------------------- /src/algorithm/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019-03-05 10:34 9 | """ -------------------------------------------------------------------------------- /src/algorithm/cluster/Kmeans/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/11/26 9:49 AM 9 | """ -------------------------------------------------------------------------------- /src/algorithm/cluster/Kmeans/k_means_cluster.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: k_means_cluster.py 8 | @time: 2018/11/26 9:50 AM 9 | """ 10 | 11 | from sklearn.cluster import KMeans 12 | from sklearn import feature_extraction 13 | from sklearn.feature_extraction.text import TfidfTransformer 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | 16 | 17 | def tfidf_vector(corpus_path): 18 | """vectorize the input documents""" 19 | corpus_train = [] 20 | # 利用train-corpus提取特征 21 | target_train = [] 22 | for line in open(corpus_path): 23 | line = line.strip().split('\t') 24 | if len(line) == 2: 25 | words = line[1] 26 | category = line[0] 27 | target_train.append(category) 28 | corpus_train.append(words) 29 | print "build train-corpus done!!" 30 | # count_v1 = CountVectorizer(max_df=0.4, min_df=0.01) 31 | count_v1 = CountVectorizer() 32 | counts_train = count_v1.fit_transform(corpus_train) 33 | 34 | word_dict = {} 35 | for index, word in enumerate(count_v1.get_feature_names()): 36 | word_dict[index] = word 37 | 38 | print "the shape of train is " + repr(counts_train.shape) 39 | tfidftransformer = TfidfTransformer() 40 | tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train) 41 | return tfidf_train, word_dict 42 | 43 | 44 | def cluster_kmeans(tfidf_train, word_dict, cluster_docs, cluster_keywords, num_clusters): # K均值分类 45 | """topic cluster""" 46 | 47 | f_docs = open(cluster_docs, 'w+') 48 | km = KMeans(n_clusters=num_clusters) 49 | km.fit(tfidf_train) 50 | clusters = km.labels_.tolist() 51 | cluster_dict = {} 52 | order_centroids = km.cluster_centers_.argsort()[:, ::-1] 53 | doc = 1 54 | for cluster in clusters: 55 | f_docs.write(str(str(doc)) + ',' + str(cluster) + '\n') 56 | doc += 1 57 | if cluster not in cluster_dict: 58 | cluster_dict[cluster] = 1 59 | else: 60 | cluster_dict[cluster] += 1 61 | f_docs.close() 62 | cluster = 1 63 | 64 | f_clusterwords = open(cluster_keywords, 'w+') 65 | for ind in order_centroids: # 每个聚类选 50 个词 66 | words = [] 67 | for index in ind: 68 | words.append(word_dict[index]) 69 | print cluster, ','.join(words) 70 | # f_clusterwords.write(str(cluster) + '\t' + ','.join(words) + '\n') 71 | cluster += 1 72 | print '*****' * 5 73 | f_clusterwords.close() 74 | 75 | 76 | def best_kmeans(tfidf_matrix, word_dict): 77 | """select the best cluster num""" 78 | 79 | import matplotlib.pyplot as plt 80 | from matplotlib.font_manager import FontProperties 81 | from sklearn.cluster import KMeans 82 | from scipy.spatial.distance import cdist 83 | import numpy as np 84 | K = range(1, 300) 85 | meandistortions = [] 86 | for k in K: 87 | print k,'****'*5 88 | kmeans = KMeans(n_clusters=k) 89 | kmeans.fit(tfidf_matrix) 90 | meandistortions.append(sum(np.min(cdist(tfidf_matrix.toarray(), kmeans.cluster_centers_, 'euclidean'), axis=1)) / tfidf_matrix.shape[0]) 91 | plt.plot(K, meandistortions, 'bx-') 92 | plt.grid(True) 93 | plt.xlabel('Number of clusters') 94 | plt.ylabel('Average within-cluster sum of squares') 95 | plt.title('Elbow for Kmeans clustering') 96 | plt.show() 97 | 98 | 99 | if __name__=='__main__': 100 | corpus_train = "/Users/li/PycharmProjects/event_parser/src/text.txt" 101 | cluster_docs = "/Users/li/PycharmProjects/event_parser/src/cluster_result_document.txt" 102 | cluster_keywords = "/Users/li/PycharmProjects/event_parser/src/cluster_result_keyword.txt" 103 | num_clusters = 15 104 | tfidf_train, word_dict = tfidf_vector(corpus_train) 105 | best_kmeans(tfidf_train, word_dict) 106 | cluster_kmeans(tfidf_train, word_dict, cluster_docs, cluster_keywords, num_clusters) 
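
A note on choosing num_clusters: best_kmeans() above only plots the elbow curve, and the __main__ block then hard-codes num_clusters = 15 after visual inspection. Below is a minimal sketch, not taken from this repository, of how the elbow point could instead be picked programmatically from the same meandistortions list that best_kmeans() computes; pick_elbow_k and its arguments are illustrative names only.

def pick_elbow_k(distortions, k_values):
    # Approximate the "knee" of the distortion curve as the k with the largest
    # second-order difference of the average within-cluster distortion.
    best_k, best_score = k_values[1], float('-inf')
    for i in range(1, len(distortions) - 1):
        score = distortions[i - 1] - 2 * distortions[i] + distortions[i + 1]
        if score > best_score:
            best_score, best_k = score, k_values[i]
    return best_k

# Hypothetical usage, assuming K and meandistortions as defined in best_kmeans():
# num_clusters = pick_elbow_k(meandistortions, list(K))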
-------------------------------------------------------------------------------- /src/algorithm/cluster/LDA/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/11/26 9:49 AM 9 | """ -------------------------------------------------------------------------------- /src/algorithm/cluster/LDA/lda_cluster.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: lda_cluster.py 8 | @time: 2018/11/26 5:01 PM 9 | """ 10 | 11 | import os, sys 12 | 13 | reload(sys) 14 | sys.setdefaultencoding('utf-8') 15 | from gensim.models import LdaModel, TfidfModel, LsiModel 16 | from gensim import similarities 17 | from gensim import corpora 18 | 19 | 20 | def create_data(corpus_path): # 构建数据,先后使用doc2bow和tfidf model对文本进行向量表示 21 | sentences = [] 22 | sentence_dict = {} 23 | count = 0 24 | for line in open(corpus_path): 25 | # print line 26 | line = line.strip().split('\t') 27 | if len(line) == 2: 28 | sentence_dict[count] = line[1] 29 | count += 1 30 | sentences.append(line[1].split(',')) 31 | else: 32 | break 33 | # 对文本进行处理,得到文本集合中的词表 34 | dictionary = corpora.Dictionary(sentences) 35 | # 利用词表,对文本进行cbow表示 36 | corpus = [dictionary.doc2bow(text) for text in sentences] 37 | # 利用cbow,对文本进行tfidf表示 38 | tfidf = TfidfModel(corpus) 39 | corpus_tfidf = tfidf[corpus] 40 | return sentence_dict, dictionary, corpus, corpus_tfidf 41 | 42 | 43 | def lda_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lda): # 使用lda模型,获取主题分布 44 | lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=11) 45 | f_keyword = open(cluster_keyword_lda, 'w+') 46 | for topic in lda.print_topics(11, 53): 47 | print '****' * 5 48 | words = [] 49 | for word in topic[1].split('+'): 50 | word = word.split('*')[1].replace(' ', '') 51 | words.append(word) 52 | f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n') 53 | # 利用lsi模型,对文本进行向量表示,这相当于与tfidf文档向量表示进行了降维,维度大小是设定的主题数目 54 | corpus_lda = lda[corpus_tfidf] 55 | for doc in corpus_lda: 56 | print len(doc), doc 57 | return lda 58 | 59 | 60 | def lsi_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lsi): # 使用lsi模型,获取主题分布 61 | lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=11) 62 | f_keyword = open(cluster_keyword_lsi, 'w+') 63 | for topic in lsi.print_topics(11, 50): 64 | print topic[0] 65 | words = [] 66 | for word in topic[1].split('+'): 67 | word = word.split('*')[1].replace(' ', '') 68 | words.append(word) 69 | f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n') 70 | 71 | return lsi 72 | 73 | 74 | if __name__ == "__main__": 75 | corpus_path = "/Users/li/PycharmProjects/event_parser/src/text.txt" 76 | # corpus_path = "/Users/li/PycharmProjects/event_parser/src/corpus_train.txt" 77 | cluster_keyword_lda = '/Users/li/PycharmProjects/event_parser/src/cluster_keywords_lda.txt' 78 | cluster_keyword_lsi = '/Users/li/PycharmProjects/event_parser/src/cluster_keywords_lsi.txt' 79 | sentence_dict, dictionary, corpus, corpus_tfidf = create_data(corpus_path) 80 | # for i in corpus_tfidf: 81 | # print i 82 | lsi_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lsi) 83 | lda_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lda) 84 | 
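
lda_cluster.py trains the LDA and LSI models and writes the per-topic keywords to disk, but it does not show how a trained model is applied to unseen text. A minimal sketch of that step, assuming the lda model returned by lda_model(), the dictionary returned by create_data(), and an already-tokenized new_tokens list (those names come from the file above; the helper itself is not part of this repository):

def dominant_topic(lda, dictionary, new_tokens):
    bow = dictionary.doc2bow(new_tokens)        # bag-of-words vector for the new text
    topic_dist = lda.get_document_topics(bow)   # [(topic_id, probability), ...]
    if not topic_dist:
        return None
    return max(topic_dist, key=lambda item: item[1])

# Hypothetical usage with a tokenized headline:
# print(dominant_topic(lda, dictionary, [u'股票', u'利好', u'公告']))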
-------------------------------------------------------------------------------- /src/algorithm/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | # OnePassCluster 6 | 7 | @version: ?? 8 | @author: li 9 | @file: __init__.py.py 10 | @time: 2018/11/8 10:41 AM 11 | """ 12 | 13 | import sys 14 | sys.path.append('../') 15 | sys.path.append('..') 16 | sys.path.append('../../') -------------------------------------------------------------------------------- /src/algorithm/cluster/singlePass/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/11/28 3:49 PM 9 | """ 10 | -------------------------------------------------------------------------------- /src/algorithm/cluster/singlePass/singlePassCluster.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: singlePassCluster.py 8 | @time: 2018/11/26 9:48 AM 9 | """ 10 | import gc 11 | import time 12 | import numpy as np 13 | from math import sqrt 14 | from tqdm import tqdm 15 | 16 | 17 | class ClusterUnit(object): 18 | """ 19 | # 定义一个簇单元 20 | """ 21 | 22 | def __init__(self): 23 | self.node_list = [] # 该簇存在的节点列表 24 | self.node_num = 0 # 该簇节点数 25 | self.centroid = None # 该簇质心 26 | 27 | def add_node(self, node_id, node_vec): 28 | """ 29 | 为本簇添加指定节点,并更新簇心 30 | :param node_id: 节点ID 31 | :param node_vec: 该节点对应的特征向量 32 | :return: null 33 | """ 34 | self.node_list.append(node_id) 35 | try: 36 | self.centroid = (self.node_num * self.centroid + node_vec) / (self.node_num + 1) # 更新簇心 37 | except TypeError: 38 | self.centroid = np.array(node_vec) * 1 # 初始化质心 39 | self.node_num += 1 # 节点数加1 40 | 41 | def remove_node(self, node_id): 42 | # 移除本簇指定节点 43 | try: 44 | self.node_list.remove(node_id) 45 | # 更新簇心 46 | self.node_num -= 1 47 | except ValueError: 48 | raise ValueError("%s not in this cluster" % node_id) # 该簇本身就不存在该节点,移除失败 49 | 50 | def move_node(self, node_id, another_cluster): 51 | # 将本簇中的其中一个节点移至另一个簇 52 | self.remove_node(node_id=node_id) 53 | another_cluster.add_node(node_id=node_id) 54 | 55 | 56 | def euclidean_distance(vec_a, vec_b): 57 | # 计算向量a与向量b的欧式距离 58 | diff = vec_a - vec_b 59 | return sqrt(np.dot(diff, diff)) # dot计算矩阵内积 60 | 61 | 62 | def cosine_distance(vec_a, vec_b): 63 | # 计算向量a与向量b的余弦距离 64 | dot_product = 0.0 65 | norm_a = 0.0 66 | norm_b = 0.0 67 | for a, b in zip(vec_a, vec_b): 68 | dot_product += a * b 69 | norm_a += a ** 2 70 | norm_b += b ** 2 71 | if norm_a == 0.0 or norm_b == 0.0: 72 | return 0 73 | else: 74 | return round(dot_product / ((norm_a ** 0.5) * (norm_b ** 0.5)) * 100, 2) 75 | 76 | 77 | def cosine_distance_numpy(vector1, vector2): 78 | vector1 = vector1.reshape([-1]) 79 | vector2 = vector2.reshape([-1]) 80 | cos_v12 = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2)) 81 | return cos_v12 82 | 83 | 84 | class OnePassCluster: 85 | def __init__(self, threshold, vector_tuple): 86 | # t:一趟聚类的阈值 87 | self.threshold = threshold # 一趟聚类的阈值 88 | # self.vectors = np.array(vector_tuple) 89 | self.vectors = vector_tuple 90 | self.cluster_list = [] # 聚类后簇的列表 91 | t1 = time.time() 92 | self.clustering() 93 | t2 = time.time() 94 | self.cluster_num = len(self.cluster_list) # 
聚类完成后 簇的个数 95 | self.spend_time = t2 - t1 # 聚类花费的时间 96 | 97 | def clustering(self): 98 | self.cluster_list.append(ClusterUnit()) # 初始新建一个簇 99 | self.cluster_list[0].add_node(self.vectors[0][0], self.vectors[0][1]) # 将读入的第一个节点归于该簇 100 | for index in tqdm(range(len(self.vectors))[1:]): 101 | # min_distance = euclidean_distance(vec_a=self.vectors[index][1], 102 | # vec_b=self.cluster_list[0].centroid) # 与簇的质心的最小欧式距离 103 | min_distance = cosine_distance(vec_a=self.vectors[index][1], 104 | vec_b=self.cluster_list[0].centroid) # 与簇的质心的最小cosine距离 105 | 106 | # print("index:{}, min_distance:{}".format(index, min_distance)) 107 | min_cluster_index = 0 # 最小距离的簇的索引 108 | # print "len of cluster_list %s " % len(self.cluster_list) 109 | for cluster_index, cluster in enumerate(self.cluster_list[1:]): 110 | # enumerate会将数组或列表组成一个索引序列 111 | # 寻找距离最小的簇,记录下距离和对应的簇的索引 112 | # distance = euclidean_distance(vec_a=self.vectors[index][1], 113 | # vec_b=cluster.centroid) 114 | distance = cosine_distance(vec_a=self.vectors[index][1], 115 | vec_b=cluster.centroid) 116 | # print("cluster_index:{}, distance:{}".format(cluster_index, distance)) 117 | if distance > min_distance: # 使用欧式距离是改为小于号 118 | min_distance = distance 119 | min_cluster_index = cluster_index + 1 120 | # print 'max_dist: %s' % min_distance 121 | # print 'min_cluster_index: %s' % min_cluster_index 122 | if min_distance > self.threshold: # 最小距离小于阈值,则归于该簇 # 使用欧式距离时改为小于号 123 | self.cluster_list[min_cluster_index].add_node(self.vectors[index][0], self.vectors[index][1]) 124 | else: # 否则新建一个簇 125 | new_cluster = ClusterUnit() 126 | new_cluster.add_node(self.vectors[index][0], self.vectors[index][1]) 127 | self.cluster_list.append(new_cluster) 128 | del new_cluster 129 | gc.collect() 130 | 131 | def print_result(self, label_dict=None): 132 | # 打印出聚类结果 133 | # label_dict:节点对应的标签字典 134 | print("******* one-pass cluster result ***********") 135 | for index, cluster in enumerate(self.cluster_list): 136 | print("cluster:%s" % index) # 簇的序号 137 | print("簇心: %s" % cluster.centroid) # 簇心 138 | print(cluster.node_list) # 该簇的节点列表 139 | if label_dict is not None: 140 | print(" ".join([label_dict[n] for n in cluster.node_list])) # 若有提供标签字典,则输出该簇的标签 141 | print("node num: %s" % cluster.node_num) 142 | print("-------------") 143 | print("the number of nodes %s" % len(self.vectors)) 144 | print("the number of cluster %s" % self.cluster_num) 145 | print("spend time %.9fs" % (self.spend_time / 1000)) 146 | 147 | 148 | if __name__ == '__main__': 149 | # cluster unit 测试 150 | # cluster_unit = ClusterUnit() 151 | # cluster_unit.add_node(1, [1, 1, 2]) 152 | # cluster_unit.add_node(5, [2, 1, 2]) 153 | # cluster_unit.add_node(3, [3, 1, 2]) 154 | # print cluster_unit.centroid 155 | 156 | # 读取测试集 157 | temperature_all_city = np.loadtxt('c2.txt', delimiter=",", usecols=(3, 4)) # 读取聚类特征:[最高温度, 最低温度] 158 | temperature_all_city_index = np.loadtxt('c2.txt', delimiter=",", usecols=0) # 索引 159 | 160 | result = [] 161 | for i in range(len(temperature_all_city_index)): 162 | result.append((temperature_all_city_index[i], temperature_all_city[i])) 163 | 164 | xy_ = dict() 165 | xy = np.loadtxt('c2.txt', delimiter=",", usecols=(8, 9)) # 读取各地经纬度 166 | for i in range(len(temperature_all_city_index)): 167 | xy_[temperature_all_city_index[i]] = xy[i] 168 | 169 | f = open('c2.txt', 'r') 170 | lines = f.readlines() 171 | zone = [i.split(',')[1] for i in lines] # 读取地区并转化为字典 172 | zone_dict = dict() 173 | for i in range(len(zone)): 174 | zone_dict[temperature_all_city_index[i]] = zone[i] 175 | 
f.close() 176 | 177 | # 构建一趟聚类器 178 | clustering = OnePassCluster(vector_tuple=result, threshold=97) 179 | # clustering.print_result() 180 | clustering.print_result(label_dict=zone_dict) 181 | 182 | # 将聚类结果导出图 183 | # import matplotlib.pylab as pl 184 | # fig, ax = pl.subplots() 185 | # fig = zone_dict 186 | # c_map = pl.get_cmap('jet', clustering.cluster_num) 187 | # c = 0 188 | # for cluster in clustering.cluster_list: 189 | # for node in cluster.node_list: 190 | # ax.scatter(xy_[node][0], xy_[node][1], c=c, s=30, cmap=c_map, vmin=0, vmax=clustering.cluster_num) 191 | # c += 1 192 | # pl.show() 193 | -------------------------------------------------------------------------------- /src/algorithm/cluster/singlePass/singlePassCluster_copy.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: singlePassCluster.py 8 | @time: 2018/11/26 9:48 AM 9 | """ 10 | 11 | import numpy as np 12 | from math import sqrt 13 | import time 14 | import matplotlib.pylab as pl 15 | 16 | 17 | class ClusterUnit: 18 | """ 19 | # 定义一个簇单元 20 | """ 21 | def __init__(self): 22 | self.node_list = [] # 该簇存在的节点列表 23 | self.node_num = 0 # 该簇节点数 24 | self.centroid = None # 该簇质心 25 | 26 | def add_node(self, node, node_vec): 27 | """ 28 | 为本簇添加指定节点,并更新簇心 29 | node_vec:该节点的特征向量 30 | node:节点 31 | return:null 32 | """ 33 | self.node_list.append(node) 34 | try: 35 | self.centroid = (self.node_num * self.centroid + node_vec) / (self.node_num + 1) # 更新簇心 36 | except TypeError: 37 | self.centroid = np.array(node_vec) * 1 # 初始化质心 38 | self.node_num += 1 # 节点数加1 39 | 40 | def remove_node(self, node): 41 | # 移除本簇指定节点 42 | try: 43 | self.node_list.remove(node) 44 | # 更新簇心 45 | self.node_num -= 1 46 | except ValueError: 47 | raise ValueError("%s not in this cluster" % node) # 该簇本身就不存在该节点,移除失败 48 | 49 | def move_node(self, node, another_cluster): 50 | # 将本簇中的其中一个节点移至另一个簇 51 | self.remove_node(node=node) 52 | another_cluster.add_node(node_id=node) 53 | 54 | 55 | # cluster_unit = ClusterUnit() 56 | # cluster_unit.add_node(1, [1, 1, 2]) 57 | # cluster_unit.add_node(5, [2, 1, 2]) 58 | # cluster_unit.add_node(3, [3, 1, 2]) 59 | # print cluster_unit.centroid 60 | 61 | 62 | def euclidean_distance(vec_a, vec_b): 63 | # 计算向量a与向量b的欧式距离 64 | diff = vec_a - vec_b 65 | return sqrt(np.dot(diff, diff)) # dot计算矩阵内积 66 | 67 | 68 | def cosine_distance(vec_a, vec_b): 69 | # 计算向量a与向量b的余弦距离 70 | dot_product = 0.0 71 | normA = 0.0 72 | normB = 0.0 73 | for a, b in zip(vec_a, vec_b): 74 | dot_product += a * b 75 | normA += a ** 2 76 | normB += b ** 2 77 | if normA == 0.0 or normB == 0.0: 78 | return 0 79 | else: 80 | return round(dot_product / ((normA**0.5)*(normB**0.5)) * 100, 2) 81 | 82 | 83 | class OnePassCluster: 84 | def __init__(self, threshold, vector_list): 85 | # t:一趟聚类的阈值 86 | self.threshold = threshold # 一趟聚类的阈值 87 | self.vectors = np.array(vector_list) 88 | self.cluster_list = [] # 聚类后簇的列表 89 | t1 = time.time() 90 | self.clustering() 91 | t2 = time.time() 92 | self.cluster_num = len(self.cluster_list) # 聚类完成后 簇的个数 93 | self.spend_time = t2 - t1 # 聚类花费的时间 94 | 95 | def clustering(self): 96 | self.cluster_list.append(ClusterUnit()) # 初始新建一个簇 97 | self.cluster_list[0].add_node(0, self.vectors[0]) # 将读入的第一个节点归于该簇 98 | for index in range(len(self.vectors))[1:]: 99 | # min_distance = euclidean_distance(vec_a=self.vectors[index], 100 | # vec_b=self.cluster_list[0].centroid) # 与簇的质心的最小欧式距离 101 | min_distance = 
cosine_distance(vec_a=self.vectors[index], 102 | vec_b=self.cluster_list[0].centroid) # 与簇的质心的最小cosine距离 103 | 104 | # print("index:{}, min_distance:{}".format(index, min_distance)) 105 | min_cluster_index = 0 # 最小距离的簇的索引 106 | # print "len of cluster_list %s " % len(self.cluster_list) 107 | for cluster_index, cluster in enumerate(self.cluster_list[1:]): 108 | # enumerate会将数组或列表组成一个索引序列 109 | # 寻找距离最小的簇,记录下距离和对应的簇的索引 110 | # distance = euclidean_distance(vec_a=self.vectors[index], 111 | # vec_b=cluster.centroid) 112 | distance = cosine_distance(vec_a=self.vectors[index], 113 | vec_b=cluster.centroid) 114 | # print("cluster_index:{}, distance:{}".format(cluster_index, distance)) 115 | if distance > min_distance: # 使用欧式距离是改为小于号 116 | min_distance = distance 117 | min_cluster_index = cluster_index + 1 118 | print 'max_dist: %s' % min_distance 119 | print 'min_cluster_index: %s' % min_cluster_index 120 | if min_distance > self.threshold: # 最小距离小于阈值,则归于该簇 # 使用欧式距离时改为小于号 121 | self.cluster_list[min_cluster_index].add_node(index, self.vectors[index]) 122 | else: # 否则新建一个簇 123 | new_cluster = ClusterUnit() 124 | new_cluster.add_node(index, self.vectors[index]) 125 | self.cluster_list.append(new_cluster) 126 | del new_cluster 127 | 128 | def print_result(self, label_dict=None): 129 | # 打印出聚类结果 130 | # label_dict:节点对应的标签字典 131 | print "******* one-pass cluster result ***********" 132 | for index, cluster in enumerate(self.cluster_list): 133 | print "cluster:%s" % index # 簇的序号 134 | print "簇心: %s" % cluster.centroid # 簇心 135 | print cluster.node_list # 该簇的节点列表 136 | if label_dict is not None: 137 | print " ".join([label_dict[n] for n in cluster.node_list]) # 若有提供标签字典,则输出该簇的标签 138 | print "node num: %s" % cluster.node_num 139 | print "-------------" 140 | print "the number of nodes %s" % len(self.vectors) 141 | print "the number of cluster %s" % self.cluster_num 142 | print "spend time %.9fs" % (self.spend_time / 1000) 143 | 144 | 145 | if __name__ == '__main__': 146 | # 读取测试集 147 | temperature_all_city = np.loadtxt('c2.txt', delimiter=",", usecols=(3, 4)) # 读取聚类特征:[最高温度, 最低温度] 148 | xy = np.loadtxt('c2.txt', delimiter=",", usecols=(8, 9)) # 读取各地经纬度 149 | # print(temperature_all_city) 150 | f = open('c2.txt', 'r') 151 | lines = f.readlines() 152 | zone_dict = [i.split(',')[1] for i in lines] # 读取地区并转化为字典 153 | f.close() 154 | 155 | # 构建一趟聚类器 156 | clustering = OnePassCluster(vector_list=temperature_all_city, threshold=90) 157 | clustering.print_result(label_dict=zone_dict) 158 | 159 | # 将聚类结果导出图 160 | fig, ax = pl.subplots() 161 | fig = zone_dict 162 | c_map = pl.get_cmap('jet', clustering.cluster_num) 163 | c = 0 164 | for cluster in clustering.cluster_list: 165 | for node in cluster.node_list: 166 | ax.scatter(xy[node][0], xy[node][1], c=c, s=30, cmap=c_map, vmin=0, vmax=clustering.cluster_num) 167 | c += 1 168 | pl.show() 169 | -------------------------------------------------------------------------------- /src/algorithm/cluster/singlePass/singlepassrun.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: singlepass_run.py 8 | @time: 2018/11/29 8:04 PM 9 | """ 10 | import sys 11 | sys.path.append('..') 12 | sys.path.append('../') 13 | sys.path.append('../../') 14 | import pickle 15 | from src.configure import conf 16 | from src.utils.VSM import tfidf 17 | from src.algorithm.cluster.singlePass import singlePassCluster 18 | 19 | # corpus_train_path = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt" 20 | corpus_train_path = conf.corpus_train_path 21 | # tfidf_train, word_dict = tfidf_vector(corpus_train) 22 | # tfidf_train, word_dict = tfidf.tfidf_vector(corpus_train) 23 | tfidf_train_dict, tfidf_train_tuple, word_dict = tfidf.tfidf_vectorizer(corpus_train_path) 24 | # print np.shape(tfidf_train.toarray()) 25 | # print tfidf_train.toarray()[1] 26 | 27 | # clustering = OnePassCluster(vector_tuple=tfidf_train.toarray(), threshold=10) 28 | clustering = singlePassCluster.OnePassCluster(vector_tuple=tfidf_train_tuple, threshold=10) 29 | clustering.print_result() 30 | 31 | # 将聚好的类簇保存下来,为后面的事件表示和有效事件判断使用。 32 | # clustering_path = '/Users/li/PycharmProjects/event_parser/src/model/clustering_new.pkl' 33 | clustering_path = conf.clustering_save_path 34 | with open(clustering_path, 'wb') as fw: 35 | pickle.dump(clustering, fw) 36 | 37 | # for cluster_index, cluster in enumerate(cluster_list): 38 | # print "cluster:%s" % cluster_index # 簇的序号 39 | # print cluster.node_list # 该簇的节点列表 -------------------------------------------------------------------------------- /src/configure.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: configure.py 8 | @time: 2018/10/31 1:52 PM 9 | 配置文件 10 | """ 11 | 12 | import os 13 | 14 | 15 | class Configure(object): 16 | 17 | project_path = "/Users/li/PycharmProjects/event_parser/src" 18 | # project_path = os.getcwd() 19 | 20 | # singlepass_run 和history_event 使用同一个时间段前的新闻,动态更新使用该时间之后的新闻 21 | data_time = '1545235200' 22 | 23 | # 词典目录 24 | dic_path = project_path + '/corpus' 25 | stock_new_path = dic_path + "/stock.csv" 26 | 27 | # 停用词目录 28 | stop_words_path = project_path + '/corpus/stop_words_CN' 29 | 30 | # tf-idf 训练语料文件位置,标题和正文合并在一起 31 | corpus_train_path = project_path + "/data/text_full_index.txt" 32 | 33 | # 新闻标题的保存路径 34 | corpus_news_title = project_path + "/data/text_title_index.txt" 35 | 36 | # singlePass聚类结果保存目录文件 37 | clustering_save_path = project_path + '/model/clustering_new_10.pkl' 38 | # clustering_save_path = project_path + '/model/clustering_new_20.pkl' 39 | # clustering_save_path = project_path + '/model/clustering_new_30.pkl' 40 | # clustering_save_path = project_path + '/model/clustering_new_40.pkl' 41 | 42 | corpus_news = corpus_train_path 43 | 44 | event_unit_path = project_path + '/model/event_units_new_10.pkl' 45 | # event_unit_path = project_path + '/model/event_units_new_20.pkl' 46 | # event_unit_path = project_path + '/model/event_units_new_30.pkl' 47 | # event_unit_path = project_path + '/model/event_units_new_40.pkl' 48 | 49 | event_save_path = project_path + '/model/event_model/' 50 | 51 | # TF-IDF计算相关文件 52 | tfidf_feature_path = project_path + '/model/tfidf_model/feature_full.pkl' 53 | tfidftransformer_path = project_path + '/model/tfidf_model/tfidftransformer_full.pkl' 54 | word_dict_path = project_path + '/model/tfidf_model/word_dict_full.pkl' 55 | 56 | 57 | conf = Configure() 58 | 
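
configure.py pins project_path to an absolute path on the original author's machine ("/Users/li/PycharmProjects/event_parser/src") and leaves the os.getcwd() variant commented out, so every derived path only resolves in that one environment. A minimal, more portable alternative, under the assumption that configure.py stays inside src/ (a suggestion, not the project's current behaviour):

import os

# Resolve the directory configure.py itself lives in, i.e. .../src,
# regardless of which machine or working directory the code runs from.
project_path = os.path.dirname(os.path.abspath(__file__))

Because every other Configure attribute is built by string concatenation onto project_path, the rest of the class would keep working unchanged.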
-------------------------------------------------------------------------------- /src/dynamic_update_event.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: dynamic_update_event.py 8 | @time: 2018/12/5 2:23 PM 9 | 增量式事件更新,基于历史事件库,将新增新闻实时与历史事件库进行相似度计算,最后合并 10 | # 每天十二点之前更新一次 11 | # 每天开盘前更新一次 12 | """ 13 | import sys 14 | import gc 15 | import time 16 | import datetime 17 | from src import data_reader 18 | import pandas as pd 19 | from tqdm import tqdm 20 | 21 | sys.path.append('../') 22 | sys.path.append('..') 23 | sys.path.append('../../') 24 | 25 | from src.utils.log import log_util 26 | from src.configure import conf # noqa: E402 27 | from src.utils import event_util, file_util, data_process, dicts, tokenization, time_util # noqa: E402 28 | from src.utils.VSM import tfidf 29 | from src.algorithm.cluster.singlePass import singlePassCluster 30 | 31 | logging = log_util.Logger('dynamic_update', level='debug') 32 | logging.logger.info('事件库动态更新启动时间: {}'.format(time_util.timestamp_to_time(time.time()))) 33 | # step 1、读取指定日期之后的新闻 34 | # 初次动态更新时,event_save_path下保存的是event 35 | latest_event_file = file_util.find_newest_file(conf.event_save_path) 36 | if latest_event_file is None or latest_event_file is 'NULL': 37 | # 如果没有动态更新过事件, 则today_timestamp 38 | # 读取当前时间段时间 39 | now = datetime.date.today() 40 | today_timestamp = int(time.mktime(now.timetuple())) 41 | today = time_util.timestamp_to_time(today_timestamp) 42 | # logging.logger.info('读取新闻的起始时间: {}'.format(today)) 43 | # ordered_df = data_reader.get_ordered_data(timestamp=today_timestamp) 44 | else: 45 | # 使用事件的最后更新时间作为新闻的起止时间 46 | latest_event_time = latest_event_file.split('.')[0] 47 | today_timestamp = int(latest_event_time) 48 | today = time_util.timestamp_to_time(today_timestamp) 49 | 50 | logging.logger.info('读取新闻的起始时间: {}'.format(today)) 51 | ordered_df = data_reader.get_ordered_data(timestamp=today_timestamp) 52 | 53 | # load tf-idf VSM 54 | # tfidf_feature_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/feature_1.pkl' 55 | # tfidftransformer_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/tfidftransformer_1.pkl' 56 | tfidf_feature_path = conf.tfidf_feature_path 57 | tfidf_transformer_path = conf.tfidftransformer_path 58 | tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path) 59 | tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path) 60 | 61 | # 导入词典,停用词,数据处理接口,分词接口 62 | dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(), tokenization.load_stop_words() 63 | tk = tokenization.Tokenizer(dp, stop_words) 64 | 65 | # 提取dataFrame中的内容 66 | ordered_news_lists = data_reader.trans_df_data(ordered_df, tfidf_feature, tfidf_transformer, dp, tk) 67 | 68 | # 如果当天没有新闻更新,则直接退出程序,事件单元不需要更新。 69 | # 文章重复更新, 70 | if len(ordered_news_lists) <= 0: 71 | # print '今天没有新新闻,事件单元不更新' 72 | logging.logger.info('[事件库未更新]: 今天没有新新闻,事件单元不更新') 73 | sys.exit() 74 | 75 | # for tmp in ordered_news_lists: 76 | # print tmp[0], tmp[1] 77 | 78 | # step 2、导入历史事件 79 | # 如果第一次执行dynamic_update_event文件,则event_save_path 80 | # history_event_file = file_util.find_newest_file(conf.event_save_path) 81 | # history_event_file = conf.event_save_path + latest_event_file 82 | history_event_units = event_util.load_history_event(latest_event_file) 83 | # print "[Info] 事件库中事件的个数 %s" % len(history_event_units) 84 | logging.logger.info("[事件库中事件的个数:] 
{}".format(len(history_event_units))) 85 | # for index, event_unit in enumerate(history_event_units): 86 | # print "cluster: %s" % index # 簇的序号 87 | # print event_unit.node_list # 该簇的节点列表 88 | # print event_unit.centroid 89 | 90 | len_news = len(ordered_news_lists) 91 | new_event_units = [] 92 | new_event_units.extend(history_event_units) 93 | # step 3、遍历新新闻,然后将新新闻添加到事件单元中,更新事件单元的节点和簇心 94 | for news_index in tqdm(range(len_news)): # 遍历每一篇新的新闻 95 | # 新的节点id 96 | new_node_id = ordered_news_lists[news_index][0] 97 | # 新的节点的VSM 98 | new_node_vec = ordered_news_lists[news_index][2] 99 | # max_dist = singlePassCluster.cosine_distance(history_event_units[0].centroid, ordered_news_lists[news_index][2]) 100 | max_dist = singlePassCluster.cosine_distance(new_event_units[0].centroid, new_node_vec) 101 | min_event_index = 0 102 | for event_index, new_event_unit in enumerate(new_event_units[1:]): # 遍历每一个事件单元 103 | # 计算当前新闻和每个事件元之间距离 104 | # dist = singlePassCluster.cosine_distance(history_event_unit.centroid, ordered_news_lists[news_index][2]) 105 | dist = singlePassCluster.cosine_distance(new_event_unit.centroid, new_node_vec) 106 | # print 'dist: %s' % dist 107 | # 找出最大的距离的事件元 108 | if dist > max_dist: 109 | max_dist = dist 110 | min_event_index = event_index + 1 111 | logging.logger.info('[Info] new_node_id: %s' % new_node_id) 112 | logging.logger.info('[Info] len of new_event_unit: %s' % len(new_event_units)) 113 | logging.logger.info('[Info] max_dist: %s' % max_dist) 114 | logging.logger.info('[Info] min_cluster_index: %s\n' % min_event_index) 115 | # 如果最大距离大于某一个阈值,则将该新闻归并到该事件单元 116 | if max_dist > 10: 117 | # new_node_id = ordered_news_lists[news_index][0] 118 | # new_node_vec = ordered_news_lists[news_index][2] 119 | new_event_units[min_event_index].add_node(new_node_id, new_node_vec) 120 | # new_event_units[min_event_index].add_unit_title() 121 | # new_event_units[min_event_index].event_expression() 122 | else: 123 | # 否则则新建一个事件单元 124 | index = len(new_event_units) 125 | new_event = event_util.EventUnit() 126 | new_event.event_id = index 127 | new_event.add_node(new_node_id, new_node_vec) 128 | # new_event.add_unit_title() 129 | # new_event.event_expression() 130 | new_event_units.append(new_event) 131 | del new_event 132 | gc.collect() 133 | 134 | logging.logger.info('[更新后的事件个数]: {}'.format(len(new_event_units))) 135 | # step 4、对更新的事件库进行标题和关键词更新 136 | # 事件库更新,更新标题,关键词,股票代码。 137 | # 读取数据库中的所有新闻数据 138 | full_df_data = data_reader.get_all_data().set_index('id') 139 | 140 | # 股票及股票代码 141 | stock_df = pd.read_csv(conf.stock_new_path, encoding='utf-8').set_index('SESNAME') 142 | 143 | for unit in tqdm(new_event_units): 144 | if unit.event_tag == 1: 145 | # 更新标题,股票代码,关键词等 146 | logging.logger.info("事件 [%s] 是新事件" % unit.event_id) 147 | # 读取每个事件 148 | node_df_data = full_df_data.loc[set(unit.node_list)] 149 | node_news_lists = data_reader.trans_df_data(node_df_data.reset_index(), tfidf_feature, tfidf_transformer, dp, 150 | tk) 151 | news_list = [] 152 | news_title_list = [] 153 | for i in node_news_lists: 154 | # print i[1], i[4] 155 | news_list.append(i[1]) 156 | news_title_list.append(i[4]) 157 | # 更新股票列表 158 | unit.event_expression(news_title_list, news_list, stock_df) 159 | logging.logger.info("股票列表: %s" % ','.join(tmp for tmp in unit.stocks)) 160 | logging.logger.info("关键词列表: %s" % unit.keywords) 161 | # 更新标题 162 | node_news_dict = {} 163 | for node in node_news_lists: 164 | node_news_dict[node[0]] = (node[1], node[2], node[3], node[4]) 165 | unit.title_update(node_news_dict) 166 | unit.event_tag 
= 0 # 所有内容更新完成之后将事件表示为0 167 | else: 168 | continue 169 | 170 | # step 5、将更新后的事件单元保存下来 171 | event_save_name = int(time.time()) 172 | event_save_path = conf.event_save_path 173 | # event_save_path = "/Users/li/PycharmProjects/event_parser/src/model/event_model/" 174 | event_util.event_save(new_event_units, event_save_name, event_save_path) 175 | 176 | # step 6、load最新的事件单元库 177 | file_new = file_util.find_newest_file(event_save_path) 178 | logging.logger.info('[最新的文件: %s]' % file_new) 179 | # new_event_units = event_util.load_history_event(file_new) 180 | # for i in new_event_units: 181 | # print i.topic_title 182 | # print i.event_id 183 | # print i.node_list 184 | # print i.stocks 185 | -------------------------------------------------------------------------------- /src/event2mysql.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: event2mysql.py 8 | @time: 2018-12-17 13:53 9 | 将更新好的事件按固定格式保存到mysql中 10 | """ 11 | 12 | import json 13 | import pandas as pd 14 | from src import data_reader 15 | from src.configure import conf 16 | from src.utils import file_util, event_util 17 | from src.utils.log import log_util 18 | from src.utils.engine import data_source 19 | 20 | logging = log_util.Logger('event2mysql') 21 | event_save_path = conf.event_save_path 22 | # event_save_path = "/Users/li/PycharmProjects/event_parser/src/model/event_model/" 23 | 24 | # 从文件目录中导入最新的更新文件 25 | file_new = file_util.find_newest_file(event_save_path) 26 | new_event_units = event_util.load_history_event(event_save_path + file_new) 27 | 28 | # 从数据库中读取最新的新闻的id,title,url和timestamp 29 | total_data = data_reader.get_all_data() 30 | # 将id设为index,方面后面根据id提取title和url 31 | total_data_df = total_data.set_index('id') 32 | 33 | # 将事件单元的信息整理成规定格式 34 | result = [] 35 | for item in new_event_units: 36 | # 如果是有效事件 37 | if item.effectiveness == 1: 38 | logging.logger.info('[effective event ID]: %s' % item.event_id) 39 | logging.logger.info('[effective event title]: %s' % item.topic_title) 40 | event_stock = ','.join(k for k in set(item.stocks)) 41 | logging.logger.info('[effective stock]: %s\n' % event_stock) 42 | logging.logger.info('[effective node list]: %s' % item.node_list) 43 | # 从dataFrame中获取事件单元中node的标题和出处 44 | title_url = [] 45 | time_list = [] 46 | for node_id in item.node_list: 47 | title, url, unix_time = total_data_df.loc[node_id][['title', 'url', 'unix_time']] 48 | time_list.append(unix_time) 49 | title_url.append({node_id: {'news_title': title.encode('utf-8'), 'url': url, 'unix_time': unix_time}}) 50 | event_detail = json.dumps(title_url) 51 | stop_time = max(time_list) 52 | start_time = min(time_list) 53 | logging.logger.info("[event start-stop time]start_time {}, stop_time {}".format(start_time, stop_time)) 54 | result.append((item.event_id, item.topic_title.encode('utf-8'), event_stock, start_time, stop_time, event_detail)) 55 | else: 56 | continue 57 | 58 | # 整理成dataFrame的格式 59 | result_df = pd.DataFrame(result, 60 | columns=['event_id', 'event_title', 'event_stock', 'start_time', 'stop_time', 'event_detail']) 61 | 62 | """ 63 | # 将整理好「事件以及事件涉及股票列表」数据保存到{event_detail} 64 | """ 65 | # # 创建数据库引擎 66 | engine_mysql = data_source.GetDataEngine("XAVIER_DB") 67 | # # 将「事件以及事件涉及股票列表」的数据保存到mysql中 68 | result_df.to_sql('event_detail', engine_mysql, if_exists='replace', index=False) 69 | logging.logger.info('event_detail update success') 70 | 71 | """ 72 | # 
整理出「股票以及股票涉及的事件列表」数据保存到{symbol_event_detail} 73 | """ 74 | event_symbol = result_df[['event_id', 'event_stock']] 75 | # print event_symbol 76 | lst = {} 77 | for i in range(len(event_symbol)): 78 | event_id = event_symbol.loc[i]['event_id'] 79 | event_stock = event_symbol.loc[i]['event_stock'].strip() 80 | # if event_stock != '': # 剔除没有股票的事件 81 | # for symbol in event_stock.split(','): 82 | # lst.setdefault(symbol, []).append("'" + str(event_id) + "'") 83 | for symbol in event_stock.split(','): 84 | lst.setdefault(symbol, []).append("'" + str(event_id) + "'") 85 | 86 | tmp_result = pd.DataFrame(list(lst.items()), columns=['SYMBOL', 'event_id']) 87 | tmp_result['event_id'] = tmp_result['event_id'].apply(lambda x: ','.join(x)) 88 | # print tmp_result 89 | # 将「股票以及股票涉及的事件列表」数据保存到mysql中 90 | tmp_result.to_sql('symbol_event_detail', engine_mysql, if_exists='replace', index=False) 91 | logging.logger.info('symbol_event_detail update success') 92 | -------------------------------------------------------------------------------- /src/event_update.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo $(date) 4 | 5 | source ../venv/bin/activate 6 | 7 | U_V1=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $1}'` 8 | U_V2=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $2}'` 9 | U_V3=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $3}'` 10 | 11 | if [[ ${U_V1}.${U_V2}.${U_V3} == '2.7.14' ]];then 12 | echo 'dynamic_update_event.py start' 13 | python dynamic_update_event.py 14 | 15 | sleep 5s 16 | 17 | echo 'Save Event to MySQL' 18 | python event2mysql.py 19 | echo 'Event Update Finished' 20 | else 21 | echo 'Virtualenv Start Sailed,Event Update failed' 22 | fi 23 | -------------------------------------------------------------------------------- /src/history_event.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: history_event.py 8 | @time: 2018/11/14 3:33 PM 9 | 将类簇转换成事件单元,并根据类簇中的节点id从文本中提取每个类簇对应的新闻,构成事件单元,然后提取每个事件单元涉及的股票。并且对每个事件单元提取关键词代表每个事件单元。所有的结果打包成pickle文件保存到本地。 10 | """ 11 | import gc 12 | import pickle 13 | import pandas as pd 14 | 15 | import sys 16 | sys.path.append('..') 17 | sys.path.append('../') 18 | sys.path.append('../../') 19 | 20 | from src.configure import conf 21 | from src.utils import event_util 22 | from src.utils.log import log_util 23 | from src.utils.VSM import tfidf 24 | from src.data_reader import import_news, import_title, get_event_news 25 | 26 | # import logger 27 | logging = log_util.Logger('history_event') 28 | # 导入通过singlePass聚类生成的类簇 29 | # clustering_path = '/Users/li/PycharmProjects/event_parser/src/model/clustering_new.pkl' 30 | clustering_path = conf.clustering_save_path 31 | try: 32 | with open(clustering_path, 'rb') as fr: 33 | clustering = pickle.load(fr) 34 | logging.logger.info('load cluster units from: {}'.format(clustering_path)) 35 | except IOError as err: 36 | logging.logger.error('cluster units pickle file load failed: {} and program stopped'.format(clustering_path)) 37 | sys.exit() 38 | # clustering.print_result() 39 | 40 | # 读取新闻文本 41 | # 新闻保存的路径 42 | # corpus_news = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt" 43 | corpus_news = conf.corpus_news 44 | # 新闻标题保存的路径 45 | # corpus_news_title = "/Users/li/PycharmProjects/event_parser/src/data/text_title_index.txt" 46 | corpus_news_title = conf.corpus_news_title 47 | logging.logger.info('load corpus_news_title from: {}'.format(corpus_news_title)) 48 | # 构建新闻正文词典 49 | news_dict = import_news(corpus_news) 50 | # 构建新闻标题词典 51 | news_title_dict = import_title(corpus_news_title) 52 | 53 | # load tf-idf VSM 54 | tfidf_feature_path = conf.tfidf_feature_path 55 | tfidf_transformer_path = conf.tfidftransformer_path 56 | try: 57 | tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path) 58 | logging.logger.info("TF-IDF feature load success") 59 | tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path) 60 | logging.logger.info("TF-IDF transformer load success") 61 | except: 62 | logging.logger.info("TF-IDF model load failed, please check path %s,%s" % (tfidf_feature_path, 63 | tfidf_transformer_path)) 64 | sys.exit() 65 | # 股票及股票代码 66 | stock_df = pd.read_csv(conf.stock_new_path, encoding='utf-8').set_index('SESNAME') 67 | # 事件有效性判断 68 | # effectiveness_events, non_effectiveness_events = event_util.events_effectiveness(clustering.cluster_list, news_dict) 69 | 70 | ''' 71 | 构建事件单元 72 | ''' 73 | event_unit_lists = [] 74 | # for cluster_index, cluster in enumerate(effectiveness_events): 75 | for cluster_index, cluster in enumerate(clustering.cluster_list): 76 | logging.logger.info('[event_id]: {}'.format(cluster_index)) # 簇的序号 77 | logging.logger.info('[event_node_id]: {}'.format(cluster.node_list)) # 该簇的节点列表 78 | 79 | event_unit = event_util.EventUnit() 80 | event_unit.node_list = cluster.node_list 81 | event_unit.node_num = cluster.node_num 82 | event_unit.centroid = cluster.centroid 83 | event_unit.event_id = cluster_index 84 | 85 | # 获取事件单元中的标题 86 | event_title_lists = get_event_news(news_title_dict, cluster.node_list) 87 | # 获取事件单元中的新闻正文 88 | event_news_lists = get_event_news(news_dict, cluster.node_list) 89 | # # 事件表示,提取事件中涉及的股票,对所有新闻提取关键词, 添加事件标题 90 | # stock_list, keywords_list = event_util.event_expression(event_title_lists, event_news_lists) 91 | # # 事件表示, 计算事件的标题 92 | # topic_title = event_util.units_title(cluster, news_dict, 
news_title_dict) 93 | # print "[事件标题]:\n %s " % topic_title 94 | # event_unit.topic_title, event_unit.stocks, event_unit.keywords = topic_title, stock_list, keywords_list 95 | 96 | # 添加涉及的股票和事件关键词 97 | event_unit.event_expression(event_title_lists, event_news_lists, stock_df) 98 | # 添加事件标题 99 | event_unit.add_unit_title(news_dict, news_title_dict, tfidf_feature, tfidf_transformer) 100 | event_unit_lists.append(event_unit) 101 | del event_unit 102 | gc.collect() 103 | logging.logger.info('[聚类类簇的个数]: {}'.format(len(clustering.cluster_list))) 104 | logging.logger.info('[事件库中事件的个数]: {}'.format(len(event_unit_lists))) 105 | 106 | # event_lib = EventLib() 107 | # event_lib.event_unit_list = event_unit_lists 108 | 109 | # 保存事件库 110 | # event_unit_path = '/Users/li/PycharmProjects/event_parser/src/model/event_units_new.pkl' 111 | # event_unit_path = conf.event_unit_path 112 | event_save_name = conf.data_time 113 | event_unit_path = conf.event_save_path + event_save_name + '.pkl' 114 | 115 | with open(event_unit_path, 'wb') as fw: 116 | # pickle.dump(event_lib, fw) 117 | pickle.dump(event_unit_lists, fw) 118 | logging.logger.info('[历史事件运行结束]事件库保存目录为:{}'.format(event_unit_path)) 119 | -------------------------------------------------------------------------------- /src/load_event_data.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: load_event_data.py 8 | @time: 2018-12-25 18:18 9 | """ 10 | from src.configure import conf 11 | from src.utils import file_util, event_util 12 | 13 | 14 | event_save_path = conf.event_save_path 15 | # event_save_path = "/Users/li/PycharmProjects/event_parser/src/model/event_model/" 16 | 17 | # 从文件目录中导入最新的更新文件 18 | file_new = file_util.find_newest_file(event_save_path) 19 | new_event_units = event_util.load_history_event(file_new) 20 | 21 | for i in new_event_units: 22 | print("topic_title %s" % i.topic_title) 23 | print("event_id %s" % i.event_id) 24 | print("node_list %s" % i.node_list) 25 | print("stock_list %s\n" % i.stocks) 26 | -------------------------------------------------------------------------------- /src/model/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/11/5 3:43 PM 9 | """ -------------------------------------------------------------------------------- /src/parser/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019-03-05 10:33 9 | """ 10 | import argparse 11 | import datetime 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser() 14 | 15 | date = str(datetime.date.today().strftime("%Y-%m-%d")) 16 | parser.add_argument('--start_date', type=str, default=date) 17 | parser.add_argument('--end_date', type=int, default=0) 18 | parser.add_argument('--count', type=int, default=10) 19 | parser.add_argument('--rebuild', type=bool, default=False) 20 | parser.add_argument('--schedule', type=bool, default=False) 21 | 22 | args = parser.parse_args() 23 | print(args.start_date) -------------------------------------------------------------------------------- /src/parser/news_parser/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/10/30 10:44 AM 9 | """ 10 | import sys 11 | sys.path.append('../') 12 | sys.path.append('..') 13 | sys.path.append('../../') 14 | -------------------------------------------------------------------------------- /src/parser/news_parser/dbscan.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: dbscan.py 8 | @time: 2018/10/31 10:51 AM 9 | """ 10 | 11 | from sklearn import cluster 12 | from sklearn.metrics import adjusted_rand_score 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | from sklearn.datasets.samples_generator import make_blobs 16 | from sklearn import mixture 17 | from sklearn.svm.libsvm import predict 18 | 19 | 20 | def create_data(centers, num=100, std=0.7): 21 | # 产生数据 22 | X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std) 23 | return X, labels_true 24 | 25 | 26 | def plot_data(*data): 27 | """ 28 | 数据作图 29 | """ 30 | X,labels_true = data 31 | labels=np.unique(labels_true) 32 | fig=plt.figure() 33 | ax=fig.add_subplot(1,1,1) 34 | colors='rgbycm' 35 | for i,label in enumerate(labels): 36 | position=labels_true==label 37 | ax.scatter(X[position,0],X[position,1],label="cluster %d"%label), 38 | color=colors[i%len(colors)] 39 | 40 | ax.legend(loc="best",framealpha=0.5) 41 | ax.set_xlabel("X[0]") 42 | ax.set_ylabel("Y[1]") 43 | ax.set_title("data") 44 | plt.show() 45 | 46 | 47 | # 测试函数 48 | def test_DBSCAN(*data): 49 | X,labels_true = data 50 | clst = cluster.DBSCAN() 51 | predict_labels = clst.fit_predict(X) 52 | print("ARI:%s"%adjusted_rand_score(labels_true, predict_labels)) 53 | print("Core sample num:%d"%len(clst.core_sample_indices_)) 54 | 55 | 56 | def test_DBSCAN_epsilon(*data): 57 | X,labels_true = data 58 | epsilons = np.logspace(-1,1.5) 59 | ARIs=[] 60 | Core_nums = [] 61 | for epsilon in epsilons: 62 | clst = cluster.DBSCAN(eps=epsilon) 63 | predicted_labels = clst.fit_predict(X) 64 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels)) 65 | Core_nums.append(len(clst.core_sample_indices_)) 66 | 67 | fig = plt.figure(figsize=(10,5)) 68 | ax = fig.add_subplot(1,2,1) 69 | ax.plot(epsilons,ARIs,marker = '+') 70 | ax.set_xscale('log') 71 | ax.set_xlabel(r"$\epsilon$") 72 | ax.set_ylim(0,1) 73 | ax.set_ylabel('ARI') 74 | 75 | ax = fig.add_subplot(1,2,2) 76 | ax.plot(epsilons,Core_nums,marker='o') 77 | ax.set_xscale('log') 78 | ax.set_xlabel(r"$\epsilon$") 79 | ax.set_ylabel('Core_num') 80 | 81 | fig.suptitle("DBSCAN") 82 | 
plt.show() 83 | 84 | 85 | def test_DBSCAN_min_samples(*data): 86 | X,labels_true=data 87 | min_samples = range(1,100) 88 | ARIs = [] 89 | Core_nums = [] 90 | for num in min_samples: 91 | clst = cluster.DBSCAN(min_samples=num) 92 | predicted_labels = clst.fit_predict(X) 93 | ARIs.append(adjusted_rand_score(labels_true, predicted_labels)) 94 | Core_nums.append(len(clst.core_sample_indices_)) 95 | 96 | fig=plt.figure(figsize=(10,5)) 97 | ax=fig.add_subplot(1,2,1) 98 | ax.plot(min_samples,ARIs,marker='+') 99 | ax.set_xlabel("min_samples") 100 | ax.set_ylim(0,1) 101 | ax.set_ylabel('ARI') 102 | 103 | ax=fig.add_subplot(1,2,2) 104 | ax.plot(min_samples,Core_nums,marker='o') 105 | ax.set_xlabel("min_samples") 106 | ax.set_ylabel('Core_nums') 107 | 108 | fig.suptitle("DBSCAN") 109 | plt.show() 110 | 111 | 112 | if __name__ == '__main__': 113 | X, labels_true = create_data(4) 114 | # plot_data(data) 115 | test_DBSCAN(X,labels_true) 116 | 117 | centers = [[1,1],[1,2],[2,2],[10,20]] 118 | X,labels_true = create_data(centers,1000,0.5) 119 | test_DBSCAN_epsilon(X, labels_true) 120 | 121 | centers = [[1,1],[1,2],[2,2],[10,20]] 122 | X,labels_true = create_data(centers,1000,0.5) 123 | test_DBSCAN_min_samples(X,labels_true) -------------------------------------------------------------------------------- /src/parser/news_parser/tonghuashun.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: tonghuashun.py 8 | @time: 2018/10/30 11:13 AM 9 | """ 10 | 11 | import pyhanlp 12 | import jpype 13 | from jpype import * 14 | # jvmPath = jpype.getDefaultJVMPath() 15 | # print(jvmPath) 16 | # # jpype.startJVM(jvmPath) 17 | # jpype.java.lang.System.out.println("hello world!") 18 | # java.lang.System.out.println("hello world") 19 | # # jpype.shutdownJVM() 20 | 21 | 22 | HanLP = JClass('com.hankcs.hanlp.HanLP') 23 | #中文分词 24 | print(HanLP.segment("你好,欢迎在Python中调用HanLP的API").toString()) 25 | testCases = [ 26 | "商品和服务", 27 | "结婚的和尚未结婚的确实在干扰分词啊", 28 | "买水果然后来世博园最后去世博会", 29 | "中国的首都是北京", 30 | "欢迎新老师生前来就餐", 31 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 32 | "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"] 33 | for sentence in testCases: print(HanLP.segment(sentence)) 34 | # 命名实体识别与词性标注 35 | NLPTokenizer = JClass('com.hankcs.hanlp.tokenizer.NLPTokenizer') 36 | print(NLPTokenizer.segment('中国科学院计算技术研究所的宗成庆教授正在教授自然语言处理课程')) 37 | # 关键词提取 38 | document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \ 39 | "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \ 40 | "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \ 41 | "严格地进行水资源论证和取水许可的批准。" 42 | print(HanLP.extractKeyword(document, 2)) 43 | # 自动摘要 44 | print(HanLP.extractSummary(document, 2)) 45 | # 依存句法分析 46 | print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。")) 47 | jpype.shutdownJVM() 48 | 49 | -------------------------------------------------------------------------------- /src/parser/news_parser/xueqiu.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu.py 8 | @time: 2018/10/30 2:31 PM 9 | """ 10 | import sys 11 | sys.path.append("../") 12 | 13 | from pyhanlp import * 14 | from src.data_reader import read_full_data 15 | 16 | news = read_full_data() 17 | # print news['title'] 18 | # print news['content'] 19 | 20 | for index, item in news.iterrows(): 21 | title, content = item['title'], item['content'] 22 | title_list = HanLP.extractKeyword(title, 8) 23 | content_list = HanLP.extractSummary(content, 2) 24 | print(title, title_list) 25 | print(content, content_list) 26 | 27 | 28 | 29 | # 新闻标题聚合 30 | 31 | # 个股事件 32 | # 新闻事件 33 | 34 | 35 | # 关键词提取 36 | # document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \ 37 | # "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \ 38 | # "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \ 39 | # "严格地进行水资源论证和取水许可的批准。" 40 | # print(HanLP.extractKeyword(document, 8)) 41 | 42 | 43 | # 自动摘要 44 | # print(HanLP.extractSummary(document, 3)) 45 | 46 | 47 | # 依存句法分析 48 | # print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。")) -------------------------------------------------------------------------------- /src/parser/requirement.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | attrs==19.1.0 3 | backcall==0.1.0 4 | backports.functools-lru-cache==1.5 5 | bleach==3.1.0 6 | boto==2.49.0 7 | boto3==1.9.34 8 | botocore==1.12.34 9 | bz2file==0.98 10 | certifi==2018.10.15 11 | chardet==3.0.4 12 | cycler==0.10.0 13 | Cython==0.29.6 14 | decorator==4.4.0 15 | defusedxml==0.6.0 16 | docutils==0.14 17 | entrypoints==0.3 18 | gensim==3.6.0 19 | hanlp==5.0.0 20 | idna==2.7 21 | ipykernel==5.1.0 22 | ipython==7.5.0 23 | ipython-genutils==0.2.0 24 | ipywidgets==7.4.2 25 | jedi==0.13.3 26 | jieba==0.39 27 | Jinja2==2.10.1 28 | jmespath==0.9.3 29 | joblib==0.13.2 30 | JPype1==0.6.3 31 | jsonschema==3.0.1 32 | jupyter==1.0.0 33 | jupyter-client==5.2.4 34 | jupyter-console==6.0.0 35 | jupyter-core==4.4.0 36 | kiwisolver==1.0.1 37 | MarkupSafe==1.1.1 38 | matplotlib==2.2.3 39 | mistune==0.8.4 40 | mysql-connector==2.1.6 41 | nbconvert==5.5.0 42 | nbformat==4.4.0 43 | notebook==5.7.8 44 | numpy==1.16.2 45 | pandas==0.23.4 46 | pandocfilters==1.4.2 47 | parso==0.4.0 48 | pexpect==4.7.0 49 | pickleshare==0.7.5 50 | prometheus-client==0.6.0 51 | prompt-toolkit==2.0.9 52 | ptyprocess==0.6.0 53 | Pygments==2.3.1 54 | pyhanlp==0.1.44 55 | pymssql==2.1.4 56 | pyparsing==2.2.2 57 | pyrsistent==0.15.1 58 | python-dateutil==2.7.5 59 | pytz==2018.7 60 | pyzmq==18.0.1 61 | qtconsole==4.4.3 62 | requests==2.20.0 63 | s3transfer==0.1.13 64 | scikit-learn==0.20.0 65 | scipy==1.1.0 66 | Send2Trash==1.5.0 67 | six==1.11.0 68 | sklearn==0.0 69 | smart-open==1.7.1 70 | SQLAlchemy==1.2.12 71 | subprocess32==3.5.3 72 | terminado==0.8.2 73 | testpath==0.4.2 74 | tornado==6.0.2 75 | tqdm==4.31.1 76 | traitlets==4.3.2 77 | urllib3==1.24 78 | wcwidth==0.1.7 79 | webencodings==0.5.1 80 | widgetsnbextension==3.4.2 81 | -------------------------------------------------------------------------------- /src/parser/xueqiu/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 代码维护 3 | ## discuss_parser/xueqiu_discuss_daily.py 4 | ### 描述 5 | 实时统计每天新增讨论的中涉及股票, 并且转换成股票以及股票的讨论数。 6 | ### 运行方式 7 | 每天定时运行 8 | 9 | ### 保存格式 10 | ["stock", "xid_list", "xid_count", "created_at"] 11 | 12 | 13 | ## focus_parser/xueqiu_focus_statistics.py 14 | ### 描述 15 | 每天定时统计大V关注的股票, 增量式计算每只股票的大V关注数。然后跟当天的时间一起入库 16 | 17 | ### 运行方式 18 | 每天定时运行 19 | 20 | ### 保存格式 21 | 
["symbol", "focus_total_count", "created_at"] 22 | 23 | 24 | # 定时任务维护 25 | ## discuss_focus_statistic_daily.sh 26 | 使用crontab每天定时运行该脚本文件,运行之前注意配置python的虚拟环境。 27 | 28 | 29 | 30 | 31 | # 存在的问题 32 | 将stock.csv中的股票代码需要重新转换, 将引号去掉,比如['300315'] 33 | 34 | -------------------------------------------------------------------------------- /src/parser/xueqiu/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019-04-29 14:46 9 | """ -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_focus_statistics_daily.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo $(date) 4 | 5 | #source ../venv/bin/activate 6 | 7 | U_V1=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $1}'` 8 | U_V2=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $2}'` 9 | U_V3=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $3}'` 10 | 11 | if [[ ${U_V1}.${U_V2}.${U_V3} == '3.7.2' ]];then 12 | echo 'xueqiu_discuss_daily.py start' 13 | python ./discuss_parser/xueqiu_discuss_daily.py 14 | 15 | sleep 5s 16 | 17 | echo 'xueqiu_focus_statistics.py start' 18 | python ./focus_parser/xueqiu_focus_statistics.py 19 | echo 'Finished' 20 | else 21 | echo 'Virtualenv Start Failed,Event Update failed' 22 | fi 23 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/EventsParser/2e3b8100e1e9d7140a6215d07070a90381b3007f/src/parser/xueqiu/discuss_parser/__init__.py -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/discuss_data/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019-04-09 15:19 9 | """ -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/discuss_data/discuss.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/EventsParser/2e3b8100e1e9d7140a6215d07070a90381b3007f/src/parser/xueqiu/discuss_parser/discuss_data/discuss.db -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/discuss_data/discuss.db': -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/EventsParser/2e3b8100e1e9d7140a6215d07070a90381b3007f/src/parser/xueqiu/discuss_parser/discuss_data/discuss.db' -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/discuss_parser.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: 1.0 6 | @author: LiYu 7 | @file: discuss_parser.py 8 | @time: 2019-03-26 22:51 9 | # 识别评论中的股票实体。 10 | # 对讨论进行分词,然后提取评论中的股票实体 11 | # 代码中使用了多进程来处理DataFrame数据。 12 | """ 13 | import os 14 | import glob 15 | import time 16 | import pandas as pd 17 | import multiprocessing 18 | from joblib import Parallel, delayed 19 | from src.utils import dicts 20 | from src.utils.data_process import DataPressing 21 | from src.utils.tokenization import Tokenizer, load_stop_words 22 | 23 | 24 | class DiscussParser(object): 25 | """ 26 | 讨论解析器 27 | """ 28 | def __init__(self): 29 | # 加载分词自定义词典 30 | dicts.init() 31 | self.data_process = DataPressing() 32 | # 停用词 33 | self.stop_words = load_stop_words() 34 | # 股票-股票代码对, 并且对股票代码做一些变换,比如 35 | _, self.stocks_df = dicts.load_stock_data() 36 | self.tokenizer = Tokenizer(self.data_process, self.stop_words) 37 | 38 | def __cut_process(self, text): 39 | """ 40 | 数据处理模块, 分词、提取股票实体词 41 | :param text: 42 | :return: 43 | """ 44 | print('cut_process进程: %sd 父进程ID:%s' % (os.getpid(), os.getppid())) 45 | # 分词 46 | # 用到多进程处理DataFrame,所以将类申明放到每个进程中,不然在调用token的时候,每个子进程不能再调用初始化词典 47 | text_list = self.tokenizer.token(text) 48 | # print("text_list %s" % text_list) 49 | # 提取text中涉及到的股票实体,并且转换成股票代码 50 | stock_list = self.data_process.find_stocks(text_list, self.stocks_df) 51 | # stock_list = ','.join(stock_list) # 展示使用 52 | return stock_list 53 | 54 | def tmp_func(self, tmp_df, column="text"): 55 | """ 56 | apply函数封装 57 | :param column: 需要处理的列名 58 | :param tmp_df: 59 | :return: 60 | """ 61 | print('tmp_func进程: %sd 父进程ID:%s' % (os.getpid(), os.getppid())) 62 | tmp_df['stock_list'] = tmp_df[column].apply(self.__cut_process) 63 | return tmp_df 64 | 65 | @staticmethod 66 | def __apply_parallel(df_grouped, func): 67 | """ 68 | # 多进程处理dataframe 69 | :param df_grouped: 70 | :param func: 71 | :return: 72 | """ 73 | print('apply_parallel是进程: %sd 父进程ID:%s' % (os.getpid(), os.getppid())) 74 | num_cpu = multiprocessing.cpu_count() 75 | 76 | # Parallel不使用参数的时候, 程序多进程运行, 但是字典没有加载 77 | # res_list = Parallel(n_jobs=num_cpu - 2)(delayed(func)(group) for name, group in df_grouped) 78 | # 单独使用prefer参数, 依然是单进程 79 | # res_list = Parallel(n_jobs=num_cpu - 2, prefer="threads")(delayed(func)(group) for name, group in df_grouped) 80 | # 单独使用backend, 词典可以加载成功 81 | # res_list = Parallel(n_jobs=num_cpu - 2, backend="multiprocessing")(delayed(func)(group) for name, group in df_grouped) 82 | # 两个参数都设置, 词典加载成功, 而且运行时间略有缩短 83 | res_list = Parallel(n_jobs=(num_cpu - 2), backend="multiprocessing", prefer="threads")(delayed(func)(group) for name, group in df_grouped) 84 | return pd.concat(res_list) 85 | 86 | def run(self, target_df): 87 | """ 88 | 多进程处理主程序 89 | :param target_df: 90 | :return: 91 | """ 92 | # print('run进程: %sd 父进程ID:%s' % (os.getpid(), os.getppid())) 93 | # 将输入数据按照 94 | df_grouped = target_df.groupby(target_df.index) 95 | res_df = self.__apply_parallel(df_grouped, self.tmp_func) 96 | return res_df 97 | 98 | 99 | # 测试用接口 100 | def read_csv(): 101 | path = '/Users/li/Desktop/sets1' 102 | file_list = glob.glob(os.path.join(path, "*.csv")) 103 | data_list = [] 104 | for f in file_list: 105 | data_list.append(pd.read_csv(f, 106 | header=0, 107 | names=[u'create_at', u'desc', u'id', u'rcreate_at', u'rdesc', 108 | u'rid', u'rtitle', u'ruid', u'screen_sname', u'uid'], encoding='utf-8')) 109 | # data_list.append(pd.read_csv(f)) 110 | 111 | df_result = pd.concat(data_list, sort=True) 112 | print(len(df_result)) 113 | return df_result 114 | 
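# --- Minimal sketch of the row-parallel pattern used by DiscussParser.run above:
# group the DataFrame by its index so every group is a single row, fan the groups
# out with joblib.Parallel, and concatenate the partial results. `toy_extract` and
# `toy_func` are hypothetical stand-ins for the real tokenizer + stock lookup and
# exist only for illustration; the class above wires in Tokenizer/DataPressing instead.
import multiprocessing

import pandas as pd
from joblib import Parallel, delayed


def toy_extract(text):
    # Hypothetical entity extractor: keep sentence fragments that mention "中信".
    return [part for part in text.split('.') if '中信' in part]


def toy_func(group_df):
    # Same shape as tmp_func above: add a stock_list column to a one-row group.
    group_df = group_df.copy()
    group_df['stock_list'] = group_df['text'].apply(toy_extract)
    return group_df


if __name__ == '__main__':
    demo_df = pd.DataFrame({'text': ['中信证券不错.大盘一般', '今天没有行情.空仓'] * 4})
    grouped = demo_df.groupby(demo_df.index)          # one group per row index
    n_jobs = max(1, multiprocessing.cpu_count() - 2)  # mirrors num_cpu - 2 above
    parts = Parallel(n_jobs=n_jobs, backend="multiprocessing")(
        delayed(toy_func)(group) for _, group in grouped)
    print(pd.concat(parts).sort_index())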
115 | 116 | def create_dic(target_df, xid, unix_time): 117 | tmp_dic = dict() 118 | tmp_dic['xid'] = target_df[xid] 119 | tmp_dic['unix_time'] = target_df[unix_time] 120 | return [target_df[xid], target_df[unix_time]] 121 | 122 | 123 | if __name__ == '__main__': 124 | 125 | print('main进程: %sd 父进程ID:%s' % (os.getpid(), os.getppid())) 126 | discuss_parser = DiscussParser() 127 | 128 | # test_df = pd.DataFrame() 129 | # target_df = read_csv() 130 | # # 测试用 131 | # target_df = target_df.head(5) 132 | # # 可以优化 133 | # funccc = lambda x: str(x) # 类型转换 134 | # test_df['xid'] = target_df['id'].apply(funccc) 135 | # test_df['uid'] = target_df['uid'].apply(funccc) 136 | # # 转换时间格式 137 | # test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 138 | # # 将id和unix_time构建成一个整体 139 | # test_df['information_dic'] = test_df[['xid', 'unix_time']].apply(create_dic, axis=1, args=('xid', 'unix_time')) 140 | # 141 | # # desc和rdesc两个讨论合并在一起处理 142 | # funcc = lambda x: str(x[0]) + '.' + str(x[1]) 143 | # test_df['text'] = target_df[['desc', 'rdesc']].apply(funcc, axis=1) 144 | # 145 | # res = discuss_parser.run(test_df) 146 | df = pd.DataFrame({'text': ['大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,中信证券不错'] * 1000}) 147 | start_time = time.time() 148 | 149 | res = discuss_parser.run(df) 150 | print('spend time %s' % (time.time() - start_time)) 151 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/format_transform.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: format_transform.py 8 | @time: 2019-04-09 10:41 9 | 数据转换,统计讨论数目 10 | """ 11 | import pandas as pd 12 | from src.utils.engine import data_source 13 | 14 | 15 | def county(x): 16 | """ 17 | 统计list中元素的个数 18 | :param x: 19 | :return: 20 | """ 21 | tmp = x.split(',') 22 | return len(tmp) 23 | 24 | 25 | def symbol_format(x): 26 | """ 27 | 转换股票代码的格式 28 | :param x: 29 | :return: 30 | """ 31 | symbol = x.split('\'')[1] 32 | head = int(symbol[:1]) 33 | if head == 6 or head == 9: 34 | return symbol + '.' + 'XSHG' 35 | elif head == 0 or head == 3 or head == 2: 36 | return str(symbol) + '.' + 'XSHE' 37 | elif head == 8 or head == 4: 38 | return str(symbol) + '.' + 'OC' 39 | else: 40 | return str(symbol) 41 | 42 | 43 | if __name__ == '__main__': 44 | 45 | engine_sqlite = data_source.GetDataEngine("XAVIER_SQLITE") 46 | 47 | sql = "SELECT * FROM history_stock_discuss_filter" 48 | # sql = "SELECT * FROM history_discuss_stock_filter" 49 | df = pd.read_sql(sql, engine_sqlite) 50 | 51 | df['symbol'] = df['stock'].apply(symbol_format) 52 | 53 | df['xid_count'] = df['xid_list'].apply(county) 54 | 55 | df_grouped = df.groupby('created_date') 56 | 57 | engine_mysql = data_source.GetDataEngine("VISION") 58 | 59 | for i, j in df_grouped: 60 | print(len(j)) 61 | # 这边可以统计每天的总讨论数量,由此作为大盘的特征因子 62 | # 数据按天分批插入数据库,数据如果需要重跑,则需要删掉原始的表 63 | j.to_sql('history_discuss_stock_filter', engine_mysql, if_exists='append', index=False) 64 | 65 | 66 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/participle/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/10/30 10:45 AM 9 | """ 10 | 11 | from pyhanlp import * 12 | import jieba 13 | 14 | 15 | print(HanLP.segment('你好,欢迎在Python中调用HanLP的API')) 16 | 17 | for term in HanLP.segment('下雨天地面积水'): 18 | print('{}\t{}'.format(term.word, term.nature)) # 获取单词与词性 19 | 20 | # jieba和hanlp分词结果对比 21 | testCases = [ 22 | "中美贸易战开打了,大家小心钱包", 23 | '该来的没来,不该来的来了一大堆', 24 | "商品和服务", 25 | "结婚的和尚未结婚的确实在干扰分词啊", 26 | "买水果然后来世博园最后去世博会", 27 | "中国的首都是北京", 28 | "欢迎新老师生前来就餐", 29 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 30 | "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"] 31 | for sentence in testCases: 32 | print('\t'.join(jieba.cut(sentence))) 33 | 34 | testCases = [ 35 | "中美贸易战开打了,大家小心钱包", 36 | '该来的没来,不该来的来了一大堆', 37 | "商品和服务", 38 | "结婚的和尚未结婚的确实在干扰分词啊", 39 | "买水果然后来世博园最后去世博会", 40 | "中国的首都是北京", 41 | "欢迎新老师生前来就餐", 42 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 43 | "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"] 44 | for sentence in testCases: print(HanLP.segment(sentence)) 45 | 46 | # 关键词提取 47 | document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \ 48 | "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \ 49 | "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \ 50 | "严格地进行水资源论证和取水许可的批准。" 51 | print(HanLP.extractKeyword(document, 2)) 52 | 53 | # 自动摘要 54 | print(HanLP.extractSummary(document, 3)) 55 | 56 | # 依存句法分析 57 | print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。")) -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_dicsuss_batch.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: xueqiu_dicsuss_batch.py 8 | @time: 2019-03-29 14:05 9 | """ 10 | import gc 11 | import os 12 | import glob 13 | import pandas as pd 14 | from datetime import datetime, timedelta 15 | from src.utils import time_util, dicts 16 | from src.utils.log import log_util 17 | 18 | from src.utils.data_process import DataPressing 19 | from src.utils.tokenization import Tokenizer, load_stop_words 20 | 21 | logging = log_util.Logger('xueqiu_discuss_batch') 22 | 23 | 24 | # 使用desc和rdesc作为当前用户的讨论数据,用户id为当前用户id,讨论id为 25 | def read_csv(path=None): 26 | """ 27 | 读取原始csv文件 28 | :param path: 29 | :return: 30 | """ 31 | if path is None: 32 | path = '/Users/li/Desktop/sets1' 33 | file_list = glob.glob(os.path.join(path, "*.csv")) 34 | data_list = [] 35 | for f in file_list: 36 | data_list.append(pd.read_csv(f, header=0, 37 | names=[u'create_at', u'desc', u'id', u'rcreate_at', u'rdesc', u'rid', u'rtitle', 38 | u'ruid', u'screen_sname', u'uid'], dtype={u'id': str, u'uid': str}, 39 | encoding='utf-8')) 40 | # data_list.append(pd.read_csv(f)) 41 | 42 | df_result = pd.concat(data_list, sort=True) 43 | 44 | return df_result 45 | 46 | 47 | def create_dic(df, xid, unix_time): 48 | """ 49 | 将xid和unix_time构建成一个list 50 | :param df: 51 | :param xid: 52 | :param unix_time: 53 | :return: 54 | """ 55 | tmp = dict() 56 | tmp['xid'] = df[xid] 57 | tmp['unix_time'] = df[unix_time] 58 | return [df[xid], df[unix_time]] 59 | 60 | 61 | # 数据结构调整 62 | def transform_fuc(id, stock_list): 63 | """ 64 | 将user_id和stock_list两两组合成tuple的list集合 65 | :param id: str 66 | :param stock_list: list 67 | :return: 68 | """ 69 | if len(stock_list) <= 0: 70 | pass 71 | user_id_list = [id] * len(stock_list) 72 | tuple_zip = zip(stock_list, user_id_list) 73 | tuple_list = list(tuple_zip) 74 | return tuple_list 75 | 76 | 77 | def cut_process(text, 
data_process, tokenizer, stocks_df): 78 | text_list = tokenizer.token(text) 79 | # 提取text中涉及到的股票实体,并且转换成股票代码 80 | stock_list = data_process.find_stocks(text_list, stocks_df) 81 | del data_process, tokenizer 82 | gc.collect() 83 | logging.logger.info("__cut_process ing") 84 | return stock_list 85 | 86 | 87 | def discuss_batch(discuss_df, data_process, tokenizer, stocks_df): 88 | # 对讨论数据做分词并提取股票列表 89 | discuss_df['stock_list'] = discuss_df['text'].apply(cut_process, args=(data_process, tokenizer, stocks_df)) 90 | 91 | # 对result_df下的文章id和股票集合进行结构调整 92 | # 可以改进成直接调用transform_fuc 93 | apply_func = lambda x: transform_fuc(x[0], x[1]) 94 | discuss_df['transform_res'] = discuss_df[['xid', 'stock_list']].apply(apply_func, axis=1) 95 | # print(result_df['transform_res']) 96 | 97 | # 将若干个list合并成一个list 98 | transform_res_list = [] 99 | for i in discuss_df['transform_res'].values: 100 | transform_res_list += i 101 | 102 | # 转换成DataFrame格式 103 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'xid']) 104 | 105 | # 将数据根据股票分组 106 | transform_res_grouped = transform_res_df.groupby('stock') 107 | 108 | # 合并每个分组中的文章id 109 | res_grouped = [] 110 | for group_index, group_df in transform_res_grouped: 111 | res_grouped.append([group_index, ','.join(group_df['xid'])]) 112 | # print(res_grouped) 113 | 114 | # 构建成dataFrame格式,结合运行日期,保存到数据库中 115 | batch_result = pd.DataFrame(res_grouped, columns=['stock', 'xid_list']) 116 | 117 | logging.logger.info("length of batch_result: %s" % len(batch_result)) 118 | del transform_res_df, transform_res_grouped, discuss_df 119 | gc.collect() 120 | return batch_result 121 | 122 | 123 | if __name__ == '__main__': 124 | st_time = datetime.now() 125 | 126 | # 解析器所需要的数据初始化 127 | stop_words = load_stop_words() 128 | _, stocks_df = dicts.load_stock_data() 129 | data_process = DataPressing() 130 | tokenizer = Tokenizer(data_process, stop_words) 131 | 132 | # 新建空df用于存放预处理的数据 133 | test_df = pd.DataFrame() 134 | # 读取数据 135 | target_df = read_csv() 136 | logging.logger.info('length of target from csv:{}'.format(len(target_df))) 137 | # 测试用 138 | target_df = target_df.head(500) 139 | # 可以优化, 在read_csv中添加dtype 140 | # funccc = lambda x: str(x) # 类型转换 141 | # test_df['xid'] = target_df['id'].apply(funccc) 142 | # test_df['uid'] = target_df['uid'].apply(funccc) 143 | test_df['xid'] = target_df['id'] 144 | # 转换时间格式 145 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 146 | # 将id和unix_time构建成一个整体 147 | test_df['information_dic'] = test_df[['xid', 'unix_time']].apply(create_dic, axis=1, args=('xid', 'unix_time')) 148 | 149 | # desc和rdesc两个讨论合并在一起处理 150 | funcc = lambda x: str(x[0]) + '.' 
+ str(x[1]) 151 | test_df['text'] = target_df[['desc', 'rdesc']].apply(funcc, axis=1) 152 | 153 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 154 | # print(test_df[['unix_time', 'xid', 'text']]) 155 | del target_df 156 | gc.collect() 157 | 158 | # 获取时间的最大值和最小值 159 | # start_date = time_util.timestamp_to_time(target_df['create_at'].min(), style='%Y-%m-%d') 160 | # stop_date = time_util.timestamp_to_time(target_df['create_at'].max(), style='%Y-%m-%d') 161 | # 获取时间的最大值和最小值 162 | start_date = test_df['unix_time'].min() 163 | stop_date = test_df['unix_time'].max() 164 | logging.logger.info('start_time: {}, stop_time: {}'.format(start_date, stop_date)) 165 | 166 | start_time = datetime.strptime(str(start_date), "%Y-%m-%d") 167 | stop_time = datetime.strptime(str(stop_date), "%Y-%m-%d") 168 | tmp_time = stop_time 169 | while tmp_time >= start_time: 170 | # 从最大的一天开始倒数着计算每一天 171 | # 读取当天的数据 172 | tmp_date = datetime.strftime(tmp_time, "%Y-%m-%d") 173 | discuss_df = test_df.loc[test_df['unix_time'] == tmp_date] 174 | if len(discuss_df) == 0: 175 | logging.logger.warning("{} has no discuss data".format(tmp_date)) 176 | tmp_time = tmp_time - timedelta(days=1) 177 | continue 178 | logging.logger.info("computing {} data at ".format(len(discuss_df), tmp_date)) 179 | # 单进程调用解析器 180 | result = discuss_batch(discuss_df, data_process, tokenizer, stocks_df) 181 | result['created_date'] = tmp_date 182 | logging.logger.info("{} has {} result data".format(tmp_date, len(result))) 183 | 184 | # # 创建数据库引擎 185 | # engine_mysql_test = data_source.GetDataEngine("XAVIER_DB") 186 | # engine_mysql = data_source.GetDataEngine("XAVIER") 187 | # result.to_sql('history_discuss_filter', engine_mysql, if_exists='append', index=False) 188 | # engine_sqlite = data_source.GetDataEngine('XAVIER_SQLITE') 189 | # result.to_sql('history_discuss_filter', engine_sqlite, if_exists='append', index=False) 190 | 191 | # 计算前一天时间 192 | del result, discuss_df 193 | gc.collect() 194 | tmp_time = tmp_time - timedelta(days=1) 195 | 196 | end_time = datetime.now() 197 | print((end_time - st_time).seconds) 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_batch_multi.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu_dicsuss_batch.py 8 | @time: 2019-03-29 14:05 9 | """ 10 | import gc 11 | import os 12 | import glob 13 | import pandas as pd 14 | from datetime import datetime, timedelta 15 | from src.utils import time_util 16 | from src.utils.log import log_util 17 | from src.parser.xueqiu.discuss_parser import discuss_parser 18 | 19 | logging = log_util.Logger('xueqiu_discuss_batch') 20 | 21 | 22 | # 使用desc和rdesc作为当前用户的讨论数据,用户id为当前用户id,讨论id为 23 | def read_csv(path=None): 24 | """ 25 | 读取原始csv文件 26 | :param path: 27 | :return: 28 | """ 29 | if path is None: 30 | path = '/Users/li/Desktop/sets1' 31 | file_list = glob.glob(os.path.join(path, "*.csv")) 32 | data_list = [] 33 | for f in file_list: 34 | data_list.append(pd.read_csv(f, header=0, 35 | names=[u'create_at', u'desc', u'id', u'rcreate_at', u'rdesc', u'rid', u'rtitle', 36 | u'ruid', u'screen_sname', u'uid'], dtype={u'id': str, u'uid': str}, encoding='utf-8')) 37 | # data_list.append(pd.read_csv(f)) 38 | 39 | df_result = pd.concat(data_list, sort=True) 40 | 41 | return df_result 42 | 43 | 44 | def create_dic(df, xid, unix_time): 45 | """ 46 | 将xid和unix_time构建成一个list 47 | :param df: 48 | :param xid: 49 | :param unix_time: 50 | :return: 51 | """ 52 | tmp = dict() 53 | tmp['xid'] = df[xid] 54 | tmp['unix_time'] = df[unix_time] 55 | return [df[xid], df[unix_time]] 56 | 57 | 58 | # 数据结构调整 59 | def transform_fuc(id, stock_list): 60 | """ 61 | 将user_id和stock_list两两组合成tuple的list集合 62 | :param id: str 63 | :param stock_list: list 64 | :return: 65 | """ 66 | if len(stock_list) <= 0: 67 | pass 68 | user_id_list = [id] * len(stock_list) 69 | tuple_zip = zip(stock_list, user_id_list) 70 | tuple_list = list(tuple_zip) 71 | return tuple_list 72 | 73 | 74 | def discuss_batch(discuss_df, xq_discuss_parser): 75 | # 对讨论数据做分词并提取股票列表,使用多进程解析器 76 | result_df = xq_discuss_parser.run(discuss_df) 77 | 78 | # 对result_df下的文章id和股票集合进行结构调整 79 | # 可以改进成直接调用transform_fuc 80 | apply_func = lambda x: transform_fuc(x[0], x[1]) 81 | result_df['transform_res'] = result_df[['xid', 'stock_list ']].apply(apply_func, axis=1) 82 | # print(result_df['transform_res']) 83 | 84 | # 将若干个list合并成一个list 85 | transform_res_list = [] 86 | for i in result_df['transform_res'].values: 87 | transform_res_list += i 88 | 89 | # 转换成DataFrame格式 90 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'xid']) 91 | 92 | # 将数据根据股票分组 93 | transform_res_grouped = transform_res_df.groupby('stock') 94 | 95 | # 合并每个分组中的文章id 96 | res_grouped = [] 97 | for group_index, group_df in transform_res_grouped: 98 | res_grouped.append([group_index, ','.join(group_df['xid'])]) 99 | # print(res_grouped) 100 | 101 | # 构建成dataFrame格式,结合运行日期,保存到数据库中 102 | result = pd.DataFrame(res_grouped, columns=['stock', 'xid_list']) 103 | 104 | logging.logger.info("length of result: %s" % len(result)) 105 | del xq_discuss_parser, result_df, transform_res_df, transform_res_grouped 106 | gc.collect() 107 | return result 108 | 109 | 110 | if __name__ == '__main__': 111 | st_time = datetime.now() 112 | 113 | # 新建雪球多进程解析器 114 | xq_discuss_parser = discuss_parser.DiscussParser() 115 | 116 | # 新建空df用于存放预处理的数据 117 | test_df = pd.DataFrame() 118 | target_df = read_csv() 119 | logging.logger.info('length of target from csv:{}'.format(len(target_df))) 120 | # 测试用 121 | target_df = target_df.head(50) 122 | # 可以优化 123 | # funccc = lambda x: str(x) # 类型转换 124 | # test_df['xid'] = target_df['id'].apply(funccc) 125 | # test_df['uid'] = target_df['uid'].apply(funccc) 126 | test_df['xid'] = target_df['id'] 127 
| # 转换时间格式 128 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 129 | # 将id和unix_time构建成一个整体 130 | test_df['information_dic'] = test_df[['xid', 'unix_time']].apply(create_dic, axis=1, args=('xid', 'unix_time')) 131 | 132 | # desc和rdesc两个讨论合并在一起处理 133 | funcc = lambda x: str(x[0]) + '.' + str(x[1]) 134 | test_df['text'] = target_df[['desc', 'rdesc']].apply(funcc, axis=1) 135 | 136 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 137 | # print(test_df[['unix_time', 'xid', 'text']]) 138 | del target_df 139 | gc.collect() 140 | 141 | # 获取时间的最大值和最小值 142 | # start_date = time_util.timestamp_to_time(target_df['create_at'].min(), style='%Y-%m-%d') 143 | # stop_date = time_util.timestamp_to_time(target_df['create_at'].max(), style='%Y-%m-%d') 144 | # 获取时间的最大值和最小值 145 | start_date = test_df['unix_time'].min() 146 | stop_date = test_df['unix_time'].max() 147 | logging.logger.info('start_time: {}, stop_time: {}'.format(start_date, stop_date)) 148 | 149 | start_time = datetime.strptime(str(start_date), "%Y-%m-%d") 150 | stop_time = datetime.strptime(str(stop_date), "%Y-%m-%d") 151 | tmp_time = stop_time 152 | while tmp_time >= start_time: 153 | # 从最大的一天开始倒数着计算每一天 154 | # 读取当天的数据 155 | tmp_date = datetime.strftime(tmp_time, "%Y-%m-%d") 156 | discuss_df = test_df.loc[test_df['unix_time'] == tmp_date] 157 | if len(discuss_df) == 0: 158 | logging.logger.warning("{} has no discuss data".format(tmp_date)) 159 | tmp_time = tmp_time - timedelta(days=1) 160 | continue 161 | logging.logger.info("computing {} data at ".format(len(discuss_df), tmp_date)) 162 | # 调用多进程解析器 163 | result = discuss_batch(discuss_df, xq_discuss_parser) 164 | result['created_date'] = tmp_date 165 | logging.logger.info("{} has {} result data".format(tmp_date, len(result))) 166 | 167 | # # 创建数据库引擎 168 | # engine_mysql_test = data_source.GetDataEngine("XAVIER_DB") 169 | # engine_mysql = data_source.GetDataEngine("XAVIER") 170 | # result.to_sql('history_discuss_filter', engine_mysql, if_exists='append', index=False) 171 | 172 | # 计算前一天时间 173 | del result, discuss_df 174 | gc.collect() 175 | tmp_time = tmp_time - timedelta(days=1) 176 | 177 | end_time = datetime.now() 178 | print((end_time - st_time).seconds) 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_csv.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu_discuss_csv.py 8 | @time: 2019-03-23 14:23 9 | """ 10 | 11 | # 从csv文件中读取文件 12 | import time 13 | import os 14 | import glob 15 | import pandas as pd 16 | from src.utils import time_util 17 | from src.utils.engine import data_source 18 | from src.utils.log import log_util 19 | from src.parser.xueqiu.discuss_parser import discuss_parser 20 | 21 | logging = log_util.Logger('xueqiu_discuss_csv') 22 | 23 | 24 | # 使用desc和rdesc作为当前用户的讨论数据,用户id为当前用户id,讨论id为 25 | def read_csv(path=None): 26 | """ 27 | 读取原始csv文件 28 | :param path: 29 | :return: 30 | """ 31 | if path is None: 32 | path = '/Users/li/Desktop/sets1' 33 | file_list = glob.glob(os.path.join(path, "*.csv")) 34 | data_list = [] 35 | for f in file_list: 36 | data_list.append(pd.read_csv(f, header=0, 37 | names=[u'create_at', u'desc', u'id', u'rcreate_at', u'rdesc', u'rid', u'rtitle', 38 | u'ruid', u'screen_sname', u'uid'], encoding='utf-8')) 39 | # data_list.append(pd.read_csv(f)) 40 | 41 | df_result = pd.concat(data_list, sort=True) 42 | 43 | return df_result 44 | 45 | 46 | def create_dic(df, xid, unix_time): 47 | """ 48 | 将xid和unix_time构建成一个list 49 | :param df: 50 | :param xid: 51 | :param unix_time: 52 | :return: 53 | """ 54 | tmp = dict() 55 | tmp['xid'] = df[xid] 56 | tmp['unix_time'] = df[unix_time] 57 | return [df[xid], df[unix_time]] 58 | 59 | 60 | # 数据结构调整 61 | def transform_fuc(id, stock_list): 62 | """ 63 | 将user_id和stock_list两两组合成tuple的list集合 64 | :param id: str 65 | :param stock_list: list 66 | :return: 67 | """ 68 | if len(stock_list) <= 0: 69 | pass 70 | user_id_list = [id] * len(stock_list) 71 | tuple_zip = zip(stock_list, user_id_list) 72 | tuple_list = list(tuple_zip) 73 | return tuple_list 74 | 75 | 76 | if __name__ == '__main__': 77 | start_time = time.time() 78 | xq_discuss_parser = discuss_parser.DiscussParser() 79 | 80 | test_df = pd.DataFrame() 81 | target_df = read_csv() 82 | logging.logger.info('length of target from csv:{}'.format(len(target_df))) 83 | # 测试用 84 | target_df = target_df.head(50) 85 | # 可以优化 86 | funccc = lambda x: str(x) # 类型转换 87 | test_df['xid'] = target_df['id'].apply(funccc) 88 | test_df['uid'] = target_df['uid'].apply(funccc) 89 | # 转换时间格式 90 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 91 | # 将id和unix_time构建成一个整体 92 | test_df['information_dic'] = test_df[['xid', 'unix_time']].apply(create_dic, axis=1, args=('xid', 'unix_time')) 93 | 94 | # desc和rdesc两个讨论合并在一起处理 95 | funcc = lambda x: str(x[0]) + '.' 
+ str(x[1]) 96 | test_df['text'] = target_df[['desc', 'rdesc']].apply(funcc, axis=1) 97 | 98 | # 提取股票实体词 99 | result_df = xq_discuss_parser.run(test_df) 100 | # print(result_df[['stock_list', 'information_dic']]) 101 | 102 | # 对result_df下的文章id和股票集合进行结构调整 103 | apply_func = lambda x: transform_fuc(x[0], x[1]) 104 | result_df['transform_res'] = result_df[['information_dic', 'stock_list']].apply(apply_func, axis=1) 105 | # print(result_df['transform_res']) 106 | # print(result_df['transform_res']) 107 | 108 | # 将若干个list合并成一个list 109 | transform_res_list = [] 110 | for i in result_df['transform_res'].values: 111 | transform_res_list += i 112 | 113 | # 转换成DataFrame格式 114 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'information_list']) 115 | # print(transform_res_df[['stock', 'information_list']]) 116 | # 将数据根据股票分组 117 | transform_res_grouped = transform_res_df.groupby('stock') 118 | 119 | # 合并每个分组中的文章id 120 | res_grouped = [] 121 | for group_index, group_value in transform_res_grouped: 122 | if group_index is None or group_index: 123 | pass 124 | tmp_res = [] 125 | for value in group_value['information_list']: 126 | tmp_res.append(value) 127 | res_grouped.append([group_index, tmp_res]) 128 | 129 | # 构建成dataFrame格式,结合运行日期,保存到数据库中 130 | result = pd.DataFrame(res_grouped, columns=['stock', 'information_list']) 131 | # print("result %s" % result) 132 | 133 | result_dataframe = pd.DataFrame() 134 | for i, j in result.iterrows(): 135 | tt = pd.DataFrame(j['information_list'], columns=['xid', 'created_time']) 136 | # print('tt %s' % tt) 137 | tt_grouped = tt.groupby('created_time') 138 | 139 | # 合并每个分组中的文章id 140 | res_grouped = [] 141 | for i1, j1 in tt_grouped: 142 | # print('i1 %s' % i1) 143 | # print('j1 %s' % j1) 144 | res_grouped.append([i1, ','.join(j1['xid'])]) 145 | # print(res_grouped) 146 | 147 | result_df = pd.DataFrame(res_grouped, columns=['creates_time', 'xid_list']) 148 | result_df['stock'] = j['stock'] 149 | 150 | # print(result_df) 151 | result_dataframe = result_dataframe.append(result_df, ignore_index=True) 152 | logging.logger.info('spend %s' % (time.time() - start_time)) 153 | logging.logger.info('length of result dataframe: %s' % len(result_dataframe)) 154 | 155 | # 数据库存储 156 | # engine_sqlite = data_source.GetDataEngine('XAVIER_SQLITE') 157 | engine_mysql = data_source.GetDataEngine("XAVIER") 158 | 159 | result_dataframe.to_sql('history_discuss_stock_filter_test', engine_mysql, if_exists='replace', index=False) 160 | # result_dataframe.to_sql('history_discuss_stock_filter', engine_sqlite, if_exists='replace', index=False) 161 | # -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_csv_bak.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu_discuss_csv.py 8 | @time: 2019-03-23 14:23 9 | """ 10 | 11 | # 从csv文件中读取文件] 12 | import sys 13 | sys.path.append('../') 14 | sys.path.append('../../') 15 | sys.path.append('../../../') 16 | sys.path.append('../../../../') 17 | sys.path.append('../../../../../') 18 | import os 19 | import glob 20 | import time 21 | import pandas as pd 22 | from joblib import Parallel, delayed 23 | import multiprocessing 24 | from src.configure import conf 25 | from src.utils import time_util, dicts 26 | from src.utils.engine import data_source 27 | from src.utils.data_process import DataPressing 28 | from src.utils.tokenization import Tokenizer, load_stop_words 29 | 30 | 31 | # 使用desc和rdesc作为当前用户的讨论数据,用户id为当前用户id,讨论id为 32 | def read_csv(): 33 | path = '/Users/li/Desktop/sets1' 34 | file_list = glob.glob(os.path.join(path, "*.csv")) 35 | data_list = [] 36 | for f in file_list: 37 | data_list.append(pd.read_csv(f, header=0, 38 | names=[u'create_at', u'desc', u'id', u'rcreate_at', u'rdesc', u'rid', u'rtitle', 39 | u'ruid', u'screen_sname', u'uid'], encoding='utf-8')) 40 | # data_list.append(pd.read_csv(f)) 41 | 42 | df_result = pd.concat(data_list, sort=True) 43 | print(len(df_result)) 44 | # print(df_result.keys()) 45 | # print(df_result.head()) 46 | 47 | return df_result 48 | 49 | 50 | # 导入股票实体词 51 | stock_code_dict = [] # 股票代码 52 | stock_dict = [] 53 | 54 | 55 | def load_stock_data(): 56 | dic_path = conf.dic_path 57 | st_path = dic_path + "/stock_words.txt" 58 | st_new_path = dic_path + "/stock.csv" 59 | for st in open(st_path): 60 | # st = st.decode("utf8") 61 | code1, st_code = st.split("\t") 62 | code, stock = st_code.split(",") 63 | stock_code_dict.append(code.strip("\n")) 64 | stock_dict.append(stock.strip("\n")) 65 | 66 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 67 | # stock_df.append(stocks_df.set_index('SESNAME')) 68 | for index, row in stocks_df.iterrows(): 69 | stock_dict.append(row.SESNAME) 70 | stock_dict.append(row.SYMBOL) 71 | return stock_dict, stocks_df 72 | 73 | 74 | _, stocks_df = load_stock_data() 75 | 76 | # 识别评论中的股票实体。 77 | # 对讨论进行分词,然后提取评论中的股票实体。 78 | data_process = DataPressing() 79 | dict_init = dicts.init() 80 | stop_words = load_stop_words() 81 | tokenizer = Tokenizer(data_process, stop_words) 82 | 83 | 84 | # 整理股票代码 85 | stocks_df = stocks_df.set_index('SESNAME') 86 | # print('stocks_df %s' % stocks_df) 87 | 88 | 89 | def cut_process(text): 90 | """ 91 | 数据处理模块, 分词、提取股票实体词 92 | :param text: 93 | :return: 94 | """ 95 | # 分词 96 | dicts.init() 97 | text_list = tokenizer.token(text) 98 | # 提取text中涉及到的股票实体,并且转换成股票代码 99 | stock_list = data_process.find_stocks(text_list, stocks_df) 100 | # stock_list = ','.join(stock_list) 101 | return stock_list 102 | 103 | 104 | def tmp_func(df): 105 | """ 106 | apply函数封装 107 | :param df: 108 | :return: 109 | """ 110 | df['stock_list'] = df['text'].apply(cut_process) 111 | return df[['xid', 'uid', 'stock_list', 'unix_time', 'information_dic']] 112 | 113 | 114 | def apply_parallel(df_grouped, func): 115 | """ 116 | # 多进程处理 117 | :param df_grouped: 118 | :param func: 119 | :return: 120 | """ 121 | ret_lst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in df_grouped) 122 | 123 | return pd.concat(ret_lst) 124 | 125 | 126 | def run(target_df): 127 | """ 128 | 多进程处理主程序 129 | :param target_df: 130 | :return: 131 | """ 132 | # 将输入数据按照 133 | df_grouped = target_df.groupby(target_df.index) 134 | res = apply_parallel(df_grouped, tmp_func) 135 | return res 136 | 137 | 138 | def 
create_dic(df, xid, unix_time): 139 | tmp = dict() 140 | tmp['xid'] = df[xid] 141 | tmp['unix_time'] = df[unix_time] 142 | # return str(tmp 143 | return [df[xid], df[unix_time]] 144 | 145 | 146 | # 数据结构调整 147 | def transform_fuc(id, stock_list): 148 | """ 149 | 将user_id和stock_list两两组合成tuple的list集合 150 | :param id: str 151 | :param stock_list: list 152 | :return: 153 | """ 154 | if len(stock_list) <= 0: 155 | pass 156 | user_id_list = [id] * len(stock_list) 157 | tuple_zip = zip(stock_list, user_id_list) 158 | tuple_list = list(tuple_zip) 159 | return tuple_list 160 | 161 | 162 | if __name__ == '__main__': 163 | 164 | start_time = time.time() 165 | test_df = pd.DataFrame() 166 | target_df = read_csv() 167 | # 测试用 168 | target_df = target_df.head(5) 169 | # 可以优化 170 | funccc = lambda x: str(x) # 类型转换 171 | test_df['xid'] = target_df['id'].apply(funccc) 172 | test_df['uid'] = target_df['uid'].apply(funccc) 173 | # 转换时间格式 174 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 175 | # 将id和unix_time构建成一个整体 176 | test_df['information_dic'] = test_df[['xid', 'unix_time']].apply(create_dic, axis=1, args=('xid', 'unix_time')) 177 | 178 | # desc和rdesc两个讨论合并在一起处理 179 | funcc = lambda x: str(x[0]) + '.' + str(x[1]) 180 | test_df['text'] = target_df[['desc', 'rdesc']].apply(funcc, axis=1) 181 | 182 | # 提取股票实体词 183 | result_df = run(test_df) 184 | # print(result_df[['stock_list', 'information_dic']]) 185 | 186 | # 对result_df下的文章id和股票集合进行结构调整 187 | apply_func = lambda x: transform_fuc(x[0], x[1]) 188 | result_df['transform_res'] = result_df[['information_dic', 'stock_list']].apply(apply_func, axis=1) 189 | # print(result_df['transform_res']) 190 | # print(result_df['transform_res']) 191 | 192 | # 将若干个list合并成一个list 193 | transform_res_list = [] 194 | for i in result_df['transform_res'].values: 195 | transform_res_list += i 196 | 197 | # 转换成DataFrame格式 198 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'information_list']) 199 | # print(transform_res_df[['stock', 'information_list']]) 200 | # 将数据根据股票分组 201 | transform_res_grouped = transform_res_df.groupby('stock') 202 | 203 | # 合并每个分组中的文章id 204 | res_grouped = [] 205 | for i, j in transform_res_grouped: 206 | if i is None or i: 207 | pass 208 | tmp_res = [] 209 | for k in j['information_list']: 210 | tmp_res.append(k) 211 | res_grouped.append([i, tmp_res]) 212 | 213 | # 构建成dataFrame格式,结合运行日期,保存到数据库中 214 | result = pd.DataFrame(res_grouped, columns=['stock', 'information_list']) 215 | # print("result %s" % result) 216 | 217 | result_dataframe = pd.DataFrame() 218 | for i, j in result.iterrows(): 219 | tt = pd.DataFrame(j['information_list'], columns=['xid', 'created_time']) 220 | # print('tt %s' % tt) 221 | tt_grouped = tt.groupby('created_time') 222 | 223 | # 合并每个分组中的文章id 224 | res_grouped = [] 225 | for i1, j1 in tt_grouped: 226 | # print('i1 %s' % i1) 227 | # print('j1 %s' % j1) 228 | res_grouped.append([i1, ','.join(j1['xid'])]) 229 | # print(res_grouped) 230 | 231 | result_df = pd.DataFrame(res_grouped, columns=['creates_time', 'xid_list']) 232 | result_df['stock'] = j['stock'] 233 | 234 | # print(result_df) 235 | result_dataframe = result_dataframe.append(result_df, ignore_index=True) 236 | print('spend %s' % (time.time() - start_time)) 237 | print('result_dataframe %s' % result_dataframe) 238 | 239 | # 数据库存储 240 | engine_sqlite = data_source.GetDataEngine('XAVIER_SQLITE') 241 | engine_mysql = data_source.GetDataEngine("XAVIER") 242 | 243 | 
result_dataframe.to_sql('history_discuss_stock_filter', engine_mysql, if_exists='replace', index=False) 244 | result_dataframe.to_sql('history_discuss_stock_filter', engine_sqlite, if_exists='replace', index=False) 245 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: xueqiu_discuss_daily.py 8 | @time: 2019-03-18 16:35 9 | 对大V评论进行分析,提取大V评论中的股票实体,并且整合成股票ID:{评论ID, 评论大VID, 评论时间, 评论内容} 10 | 计算时间粒度,一天 11 | 每天定时统计, 每天早上八点定时运行 12 | """ 13 | import sys 14 | sys.path.append('../') 15 | sys.path.append('../../') 16 | sys.path.append('../../../') 17 | sys.path.append('../../../../') 18 | sys.path.append('../../../../../') 19 | 20 | from src.utils import time_util 21 | from src.utils.log import log_util 22 | import pandas as pd 23 | from sqlalchemy import create_engine 24 | from src.data_reader import read_all_data 25 | from src.parser.xueqiu.discuss_parser import discuss_parser, format_transform 26 | 27 | logging = log_util.Logger('discuss_stock_filter_daily') 28 | 29 | 30 | # 数据结构调整 31 | def transform_fuc(id, stock_list): 32 | """ 33 | 将user_id和stock_list两两组合成tuple的list集合 34 | :param id: str 35 | :param stock_list: list 36 | :return: 37 | """ 38 | if len(stock_list) <= 0: 39 | pass 40 | user_id_list = [id] * len(stock_list) 41 | tuple_zip = zip(stock_list, user_id_list) 42 | tuple_list = list(tuple_zip) 43 | return tuple_list 44 | 45 | 46 | if __name__ == '__main__': 47 | pd.set_option('display.max_rows', None, 'display.max_columns', None, "display.max_colwidth", 1000, 'display.width', 1000) 48 | # engine_mysql_test = GetDataEngine("VISIONTEST") 49 | # engine_mysql = data_source.GetDataEngine("VISION") 50 | engine_mysql_test = create_engine('mysql+mysqlconnector://test_edit:test_edit_2019@db1.irongliang.com:3306/test') 51 | 52 | xq_parser = discuss_parser.DiscussParser() 53 | 54 | '''数据读取部分''' 55 | '''根据指定的时间格式, 从指定数据库中读取指定表中的数据''' 56 | # 获取两个指定的时间点 57 | # 起始时间 58 | stop_time = time_util.get_integral_point_time(0) 59 | # 截止时间为起始时间的前一天 60 | start_time = time_util.get_integral_point_time(0) - 86400 # (24*60*60) 61 | 62 | # 测试用 63 | # start_time = 1556380800 64 | # stop_time = 1556380800 + 86400 65 | 66 | logging.logger.info("program start at {}".format(time_util.timestamp_to_time(start_time), "%Y-%m-%d")) 67 | logging.logger.info("program stop at {}".format(time_util.timestamp_to_time(stop_time), "%Y-%m-%d")) 68 | # 读取原始雪球评论数据 69 | # sheet_name = 'xueqiu_discuss' 70 | # sql = "SELECT xid, uid, title, text, unix_time FROM xavier.{} WHERE unix_time >={} AND unix_time <= {} order by unix_time".format(sheet_name, str(start_time), str(stop_time)) 71 | # 雪球评论所保存的表 72 | sheet_name = 'xq_comment' 73 | # 读取指定时间段的所有数据 74 | sql = "SELECT * FROM test.{} WHERE created_at >={} AND created_at <= {} order by created_at".format(sheet_name, str(start_time * 1000), str(stop_time * 1000)) 75 | 76 | # 读取需要处理的数据,从数据库中以DataFrame的格式读取。 77 | discuss_df = read_all_data(sheet_name, engine_mysql_test, sql) 78 | # 测试用 79 | # discuss_df = discuss_df.head() 80 | # print('discuss_df %s' % discuss_df) 81 | '''数据读取部分''' 82 | 83 | if len(discuss_df) <= 0: 84 | logging.logger.warning('there is no new discuss yesterday') 85 | exit() 86 | else: 87 | logging.logger.info('load discuss data from mysql successful') 88 | # 进行分词,提取股票特征词等操作 89 | result_df = 
xq_parser.run(discuss_df) 90 | 91 | # 对result_df下的文章id和股票集合进行结构调整 92 | # 可以改进成直接调用transform_fuc 93 | apply_func = lambda x: transform_fuc(str(x[0]), x[1]) 94 | # id: 文章id, stock_list: 提取的的股票集合 95 | result_df['transform_res'] = result_df[['id', 'stock_list']].apply(apply_func, axis=1) 96 | # print(result_df['transform_res']) 97 | 98 | # 将dataframe中每一行的若干个list合并成一个list 99 | transform_res_list = [] 100 | for i in result_df['transform_res'].values: 101 | transform_res_list += i 102 | 103 | # 转换成DataFrame格式 104 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'xid']) 105 | # print(transform_res_df) 106 | 107 | # # 将数据根据股票分组 108 | transform_res_grouped = transform_res_df.groupby('stock') 109 | # 110 | # 合并每个分组中的文章id 111 | res_grouped = [] 112 | for stock, group_df in transform_res_grouped: 113 | res_grouped.append([stock, ','.join(group_df['xid'])]) 114 | # print(res_grouped) 115 | 116 | # # 构建成dataFrame格式,结合运行日期,保存到数据库中 117 | result = pd.DataFrame(res_grouped, columns=['stock', 'xid_list']) 118 | # 格式化股票代码 119 | result['stock'] = result['stock'].apply(format_transform.symbol_format) 120 | # 统计讨论的数目 121 | result['xid_count'] = result['xid_list'].apply(format_transform.county) 122 | # result['created_at'] = str(datetime.date.today().strftime("%Y-%m-%d")) 123 | result['created_at'] = str(time_util.timestamp_to_time(start_time, "%Y-%m-%d")) 124 | 125 | print(result) 126 | logging.logger.info("length of result: %s" % len(result)) 127 | 128 | # #存储到表中 129 | # 创建数据库引擎 130 | result.to_sql('xueqiu_discuss_count', engine_mysql_test, if_exists='append', index=False) 131 | logging.logger.info('数据保存到第 %s 天' % str(time_util.timestamp_to_time(start_time, "%Y-%m-%d"))) 132 | 133 | logging.logger.info('program finished') 134 | 135 | 136 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily_bak.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu_discuss_daily_bak.py 8 | @time: 2019-03-18 16:35 9 | 对大V评论进行分析,提取大V评论中的股票实体,并且整合成股票ID:{评论ID, 评论大VID, 评论时间, 评论内容} 10 | 计算时间粒度,一天 11 | 每天定时统计, 每天早上八点定时运行 12 | """ 13 | 14 | import datetime 15 | from joblib import Parallel, delayed 16 | import multiprocessing 17 | from src.utils import dicts, time_util 18 | import pandas as pd 19 | from src.data_reader import read_all_data 20 | from src.configure import conf 21 | from src.utils.log import log_util 22 | from src.utils.data_process import DataPressing 23 | from src.utils.tokenization import Tokenizer, load_stop_words 24 | 25 | logging = log_util.Logger('discuss_stock_filter') 26 | 27 | # 获取两个时间点 28 | # 指定时间 29 | stop_time = time_util.get_integral_point_time(9) 30 | # 指定时间前一天 31 | start_time = time_util.get_integral_point_time(9) - 86400 # (24*60*60) 32 | 33 | start_time = 1552179600 34 | stop_time = 1552179600 + 86400 35 | 36 | logging.logger.info("program start at {}".format(start_time)) 37 | # 读取原始雪球评论数据 38 | sheet_name = 'xueqiu_discuss' 39 | sql = "SELECT xid, uid, title, text, unix_time FROM xavier.{} WHERE unix_time >={} AND unix_time <= {} order by unix_time".format(sheet_name, str(start_time), str(stop_time)) 40 | # sql = "SELECT count(*) FROM xavier_db.%s ORDER BY unix_time" % sheet_name 41 | 42 | # 读取需要处理的数据,从数据库中以DataFrame的格式读取。 43 | discuss_df = read_all_data(sheet_name, sql) 44 | # 测试用 45 | discuss_df = discuss_df.head() 46 | if len(discuss_df) <= 0: 47 | logging.logger.warning('there is no new discuss yesterday') 48 | exit() 49 | else: 50 | logging.logger.info('load discuss data from mysql successful') 51 | 52 | # print('discuss_df %s' % discuss_df) 53 | 54 | 55 | # 导入股票实体词 56 | stock_code_dict = [] # 股票代码 57 | stock_dict = [] 58 | 59 | 60 | def load_stock_data(): 61 | dic_path = conf.dic_path 62 | st_path = dic_path + "/stock_words.txt" 63 | st_new_path = dic_path + "/stock.csv" 64 | for st in open(st_path): 65 | st = st.decode("utf8") 66 | code1, st_code = st.split("\t") 67 | code, stock = st_code.split(",") 68 | stock_code_dict.append(code.strip("\n")) 69 | stock_dict.append(stock.strip("\n")) 70 | 71 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 72 | # stock_df.append(stocks_df.set_index('SESNAME')) 73 | for index, row in stocks_df.iterrows(): 74 | stock_dict.append(row.SESNAME) 75 | stock_dict.append(row.SYMBOL) 76 | return stock_dict, stocks_df 77 | 78 | 79 | _, stocks_df = load_stock_data() 80 | 81 | # 识别评论中的股票实体。 82 | # 对讨论进行分词,然后提取评论中的股票实体。 83 | data_process = DataPressing() 84 | dict_init = dicts.init() 85 | stop_words = load_stop_words() 86 | tokenizer = Tokenizer(data_process, stop_words) 87 | 88 | 89 | # 整理股票代码 90 | stocks_df = stocks_df.set_index('SESNAME') 91 | # print('stocks_df %s' % stocks_df) 92 | 93 | 94 | def cut_process(text): 95 | """ 96 | 数据处理模块, 分词、提取股票实体词 97 | :param text: 98 | :return: 99 | """ 100 | # 分词 101 | dicts.init() 102 | text_list = tokenizer.token(text) 103 | # 提取text中涉及到的股票实体,并且转换成股票代码 104 | stock_list = data_process.find_stocks(text_list, stocks_df) 105 | # res = ','.join(stock_list) 106 | # return res 107 | return stock_list 108 | 109 | 110 | def tmp_func(df): 111 | """ 112 | apply函数封装 113 | :param df: 114 | :return: 115 | """ 116 | df['stock_list'] = df['text'].apply(cut_process) 117 | return df[['xid', 'uid', 'stock_list', 'unix_time']] 118 | 119 | 120 | def apply_parallel(df_grouped, func): 121 | """ 122 | # 多进程处理 123 | :param df_grouped: 124 | :param func: 125 | :return: 126 | """ 127 | ret_lst = 
Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in df_grouped) 128 | 129 | return pd.concat(ret_lst) 130 | 131 | 132 | def run(target_df): 133 | """ 134 | 多进程处理主程序 135 | :param target_df: 136 | :return: 137 | """ 138 | # 将输入数据按照 139 | df_grouped = target_df.groupby(target_df.index) 140 | res = apply_parallel(df_grouped, tmp_func) 141 | return res 142 | 143 | 144 | def kk_test(): 145 | """ 146 | 测试股票实体是否提取成功 147 | :return: 148 | """ 149 | text = "大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!中信证券也上市了,还是注册制的, 中信建投也不错。" 150 | cut_res = cut_process(text) 151 | print('tmp_res %s' % cut_res) 152 | 153 | test_df = pd.DataFrame({'text': [text, text, text, text, text]}) 154 | # 非多进程下直接提取股票实体 155 | test_df['stock_list'] = test_df['text'].apply(cut_process) 156 | print('test_df %s' % test_df) 157 | 158 | # 多进程下提取股票实体 159 | df_grouped = test_df.groupby(test_df.index) 160 | pp = apply_parallel(df_grouped, tmp_func) 161 | print('pp %s' % pp['stock_list']) 162 | 163 | 164 | # 对dicuss_df做提取股票代码操作 165 | result_df = run(discuss_df) 166 | # print(result_df[['xid', 'stock_list']]) 167 | 168 | 169 | # 数据结构调整 170 | def transform_fuc(id, stock_list): 171 | """ 172 | 将user_id和stock_list两两组合成tuple的list集合 173 | :param id: str 174 | :param stock_list: list 175 | :return: 176 | """ 177 | if len(stock_list) <= 0: 178 | pass 179 | user_id_list = [id] * len(stock_list) 180 | tuple_zip = zip(stock_list, user_id_list) 181 | tuple_list = list(tuple_zip) 182 | return tuple_list 183 | 184 | 185 | # 对result_df下的文章id和股票集合进行结构调整 186 | # 可以改进成直接调用transform_fuc 187 | apply_func = lambda x: transform_fuc(x[0], x[1]) 188 | result_df['transform_res'] = result_df[['xid', 'stock_list']].apply(apply_func, axis=1) 189 | print(result_df['transform_res']) 190 | 191 | # 将若干个list合并成一个list 192 | transform_res_list = [] 193 | for i in result_df['transform_res'].values: 194 | transform_res_list += i 195 | 196 | # 转换成DataFrame格式 197 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'xid']) 198 | 199 | # 将数据根据股票分组 200 | transform_res_grouped = transform_res_df.groupby('stock') 201 | 202 | # 合并每个分组中的文章id 203 | res_grouped = [] 204 | for i, j in transform_res_grouped: 205 | res_grouped.append([i, ','.join(j['xid'])]) 206 | print(res_grouped) 207 | 208 | # 构建成dataFrame格式,结合运行日期,保存到数据库中 209 | result = pd.DataFrame(res_grouped, columns=['stock', 'xid_list']) 210 | 211 | result['created_date'] = str(datetime.date.today().strftime("%Y-%m-%d")) 212 | # print(result) 213 | 214 | # # 存储到表中 215 | # # 创建数据库引擎 216 | # engine_mysql_test = data_source.GetDataEngine("XAVIER_DB") 217 | # engine_mysql = data_source.GetDataEngine("XAVIER") 218 | # result.to_sql('discuss_stock_filter_lists', engine_mysql, if_exists='replace', index=False) 219 | # 220 | # logging.logger.info('program finished') 221 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_parser.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu_discuss_parser_bak.py 8 | @time: 2019-03-05 10:31 9 | 对大V评论进行分析,提取大V评论中的股票实体,并且整合成股票ID:{评论ID, 评论大VID, 评论时间, 评论内容} 10 | 计算时间粒度,一天 11 | 12 | """ 13 | from joblib import Parallel, delayed 14 | import multiprocessing 15 | from src.utils import dicts 16 | import pandas as pd 17 | from src.data_reader import read_all_data 18 | from src.configure import conf 19 | from src.utils.engine import data_source 20 | from src.utils.data_process import DataPressing 21 | from src.utils.tokenization import Tokenizer, load_stop_words 22 | 23 | # 读取原始雪球评论数据 24 | sheet_name = 'xueqiu_discuss' 25 | sql = "SELECT xid, uid, title, text, mood, unix_time FROM xavier.%s ORDER BY unix_time" % sheet_name 26 | # sql = "SELECT count(*) FROM xavier_db.%s ORDER BY unix_time" % sheet_name 27 | 28 | 29 | # 导入股票实体词 30 | stock_code_dict = [] # 股票代码 31 | stock_dict = [] 32 | 33 | 34 | def load_stock_data(): 35 | dic_path = conf.dic_path 36 | st_path = dic_path + "/stock_words.txt" 37 | st_new_path = dic_path + "/stock.csv" 38 | for st in open(st_path): 39 | st = st.decode("utf8") 40 | code1, st_code = st.split("\t") 41 | code, stock = st_code.split(",") 42 | stock_code_dict.append(code.strip("\n")) 43 | stock_dict.append(stock.strip("\n")) 44 | 45 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 46 | # stock_df.append(stocks_df.set_index('SESNAME')) 47 | for index, row in stocks_df.iterrows(): 48 | stock_dict.append(row.SESNAME) 49 | stock_dict.append(row.SYMBOL) 50 | return stock_dict, stocks_df 51 | 52 | 53 | _, stocks_df = load_stock_data() 54 | 55 | # 识别评论中的股票实体。 56 | # 对讨论进行分词,然后提取评论中的股票实体。 57 | data_process = DataPressing() 58 | dict_init = dicts.init() 59 | stop_words = load_stop_words() 60 | tokenizer = Tokenizer(data_process, stop_words) 61 | 62 | # 读取需要处理的数据,从数据库中以DataFrame的格式读取。 63 | discuss_df = read_all_data(sheet_name, sql) 64 | # discuss_df = discuss_df.head() 65 | # print('discuss_df %s' % discuss_df['mood']) 66 | 67 | # 整理股票代码 68 | stocks_df = stocks_df.set_index('SESNAME') 69 | # print('stocks_df %s' % stocks_df) 70 | 71 | 72 | def cut_process(text): 73 | """ 74 | 数据处理模块, 分词、提取股票实体词 75 | :param text: 76 | :return: 77 | """ 78 | # 分词 79 | dicts.init() 80 | text_list = tokenizer.token(text) 81 | # 提取text中涉及到的股票实体,并且转换成股票代码 82 | stock_list = data_process.find_stocks(text_list, stocks_df) 83 | res = ','.join(stock_list) 84 | return res 85 | 86 | 87 | def tmp_func(df): 88 | """ 89 | apply函数封装 90 | :param df: 91 | :return: 92 | """ 93 | df['stock_list'] = df['text'].apply(cut_process) 94 | return df[['xid', 'uid', 'mood', 'stock_list', 'unix_time']] 95 | 96 | 97 | def apply_parallel(df_grouped, func): 98 | """ 99 | # 多进程处理 100 | :param df_grouped: 101 | :param func: 102 | :return: 103 | """ 104 | ret_lst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in df_grouped) 105 | 106 | return pd.concat(ret_lst) 107 | 108 | 109 | def run(target_df): 110 | """ 111 | 多进程处理主程序 112 | :param target_df: 113 | :return: 114 | """ 115 | # 将输入数据按照 116 | df_grouped = target_df.groupby(target_df.index) 117 | res = apply_parallel(df_grouped, tmp_func) 118 | return res 119 | 120 | 121 | def kk_test(): 122 | """ 123 | 测试股票实体是否提取成功 124 | :return: 125 | """ 126 | text = "大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!中信证券也上市了,还是注册制的, 中信建投也不错。" 127 | cut_res = cut_process(text) 128 | print('tmp_res %s' % cut_res) 129 | 130 | test_df = pd.DataFrame({'text': [text, text, text, text, text]}) 131 | # 非多进程下直接提取股票实体 132 | test_df['stock_list'] = test_df['text'].apply(cut_process) 133 | 
print('test_df %s' % test_df) 134 | 135 | # 多进程下提取股票实体 136 | df_grouped = test_df.groupby(test_df.index) 137 | pp = apply_parallel(df_grouped, tmp_func) 138 | print('pp %s' % pp['stock_list']) 139 | 140 | 141 | # 结构调整 142 | result_df = run(discuss_df) 143 | # 存储到表中 144 | print(result_df.head()) 145 | 146 | # # 创建数据库引擎 147 | engine_mysql_test = data_source.GetDataEngine("XAVIER_DB") 148 | engine_mysql = data_source.GetDataEngine("XAVIER") 149 | 150 | result_df.to_sql('discuss_stock_filter', engine_mysql, if_exists='replace', index=False) 151 | 152 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_parser_bak.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: xueqiu_discuss_parser_bak.py 8 | @time: 2019-03-05 10:31 9 | 对大V评论进行分析,提取大V评论中的股票实体,并且整合成股票ID:{评论ID, 评论大VID, 评论时间, 评论内容} 10 | 计算时间粒度,一天 11 | 12 | """ 13 | from joblib import Parallel, delayed 14 | import multiprocessing 15 | from src.utils import dicts 16 | import pandas as pd 17 | from src.data_reader import read_all_data 18 | from src.configure import conf 19 | from src.utils.engine import data_source 20 | from src.utils.data_process import DataPressing 21 | from src.utils.tokenization import Tokenizer, load_stop_words 22 | 23 | # 读取原始雪球评论数据 24 | sheet_name = 'xueqiu_discuss' 25 | sql = "SELECT xid, uid, title, text, mood, unix_time FROM xavier.%s ORDER BY unix_time" % sheet_name 26 | # sql = "SELECT count(*) FROM xavier_db.%s ORDER BY unix_time" % sheet_name 27 | 28 | 29 | # 导入股票实体词 30 | stock_code_dict = [] # 股票代码 31 | stock_dict = [] 32 | 33 | 34 | def load_stock_data(): 35 | dic_path = conf.dic_path 36 | st_path = dic_path + "/stock_words.txt" 37 | st_new_path = dic_path + "/stock.csv" 38 | for st in open(st_path): 39 | st = st.decode("utf8") 40 | code1, st_code = st.split("\t") 41 | code, stock = st_code.split(",") 42 | stock_code_dict.append(code.strip("\n")) 43 | stock_dict.append(stock.strip("\n")) 44 | 45 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 46 | # stock_df.append(stocks_df.set_index('SESNAME')) 47 | for index, row in stocks_df.iterrows(): 48 | stock_dict.append(row.SESNAME) 49 | stock_dict.append(row.SYMBOL) 50 | return stock_dict, stocks_df 51 | 52 | 53 | _, stocks_df = load_stock_data() 54 | 55 | # 识别评论中的股票实体。 56 | # 对讨论进行分词,然后提取评论中的股票实体。 57 | data_process = DataPressing() 58 | dict_init = dicts.init() 59 | stop_words = load_stop_words() 60 | tokenizer = Tokenizer(data_process, stop_words) 61 | 62 | # 读取需要处理的数据,从数据库中以DataFrame的格式读取。 63 | discuss_df = read_all_data(sheet_name, sql) 64 | # discuss_df = discuss_df.head() 65 | # print('discuss_df %s' % discuss_df['mood']) 66 | 67 | # 整理股票代码 68 | stocks_df = stocks_df.set_index('SESNAME') 69 | # print('stocks_df %s' % stocks_df) 70 | 71 | 72 | def cut_process(text): 73 | """ 74 | 数据处理模块, 分词、提取股票实体词 75 | :param text: 76 | :return: 77 | """ 78 | # 分词 79 | dicts.init() 80 | text_list = tokenizer.token(text) 81 | # 提取text中涉及到的股票实体,并且转换成股票代码 82 | stock_list = data_process.find_stocks(text_list, stocks_df) 83 | res = ','.join(stock_list) 84 | return res 85 | 86 | 87 | def tmp_func(df): 88 | """ 89 | apply函数封装 90 | :param df: 91 | :return: 92 | """ 93 | df['stock_list'] = df['text'].apply(cut_process) 94 | return df[['xid', 'uid', 'mood', 'stock_list', 'unix_time']] 95 | 96 | 97 | def apply_parallel(df_grouped, func): 98 | """ 99 | # 多进程处理 100 
| :param df_grouped: 101 | :param func: 102 | :return: 103 | """ 104 | ret_lst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in df_grouped) 105 | 106 | return pd.concat(ret_lst) 107 | 108 | 109 | def run(target_df): 110 | """ 111 | 多进程处理主程序 112 | :param target_df: 113 | :return: 114 | """ 115 | # 将输入数据按照 116 | df_grouped = target_df.groupby(target_df.index) 117 | res = apply_parallel(df_grouped, tmp_func) 118 | return res 119 | 120 | 121 | def kk_test(): 122 | """ 123 | 测试股票实体是否提取成功 124 | :return: 125 | """ 126 | text = "大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!中信证券也上市了,还是注册制的, 中信建投也不错。" 127 | cut_res = cut_process(text) 128 | print('tmp_res %s' % cut_res) 129 | 130 | test_df = pd.DataFrame({'text': [text, text, text, text, text]}) 131 | # 非多进程下直接提取股票实体 132 | test_df['stock_list'] = test_df['text'].apply(cut_process) 133 | print('test_df %s' % test_df) 134 | 135 | # 多进程下提取股票实体 136 | df_grouped = test_df.groupby(test_df.index) 137 | pp = apply_parallel(df_grouped, tmp_func) 138 | print('pp %s' % pp['stock_list']) 139 | 140 | 141 | # 结构调整 142 | result_df = run(discuss_df) 143 | # 存储到表中 144 | print(result_df.head()) 145 | 146 | # # 创建数据库引擎 147 | engine_mysql_test = data_source.GetDataEngine("XAVIER_DB") 148 | engine_mysql = data_source.GetDataEngine("XAVIER") 149 | 150 | result_df.to_sql('discuss_stock_filter', engine_mysql, if_exists='replace', index=False) 151 | 152 | -------------------------------------------------------------------------------- /src/parser/xueqiu/focus_parser/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019-04-29 14:41 9 | """ -------------------------------------------------------------------------------- /src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: xueqiu_focus_statistics.py 8 | @time: 2019-04-29 14:47 9 | 统计累计大V的股票关注数 10 | """ 11 | import sys 12 | sys.path.append('../') 13 | sys.path.append('../../') 14 | sys.path.append('../../../') 15 | sys.path.append('../../../../') 16 | sys.path.append('../../../../../') 17 | 18 | import pandas as pd 19 | import time 20 | from sqlalchemy import create_engine 21 | from src.utils import time_util 22 | from src.utils.log import log_util 23 | from src.data_reader import read_all_data 24 | 25 | 26 | logging = log_util.Logger('xueqiu_focus_statistic') 27 | 28 | 29 | def f(row): 30 | if row[:2] == 'SH': 31 | return str(row[2:]) + '.' + 'XSHG' 32 | elif row[:2] == 'SZ': 33 | return str(row[2:]) + '.' 
+ 'XSHE' 34 | 35 | 36 | if __name__ == '__main__': 37 | pd.set_option('display.max_rows', None, 'display.max_columns', None, "display.max_colwidth", 1000, 'display.width', 1000) 38 | # engine_mysql_test = data_source.GetDataEngine("VISIONTEST") 39 | # engine_mysql = data_source.GetDataEngine("VISION") 40 | engine_mysql_test = create_engine('mysql+mysqlconnector://test_edit:test_edit_2019@db1.irongliang.com:3306/test') 41 | 42 | date_time = time_util.get_integral_point_time(0) 43 | logging.logger.info("program start at {}".format(time_util.timestamp_to_time(date_time), "%Y-%m-%d")) 44 | 45 | # 读取原始大V关注数据 46 | # 大V关注所保存的表 47 | sheet_name = 'xq_user_stock' 48 | # 读取指定时间段的所有数据 49 | sql = "SELECT * FROM test.{} WHERE created <={}".format(sheet_name, str(date_time * 1000)) 50 | 51 | # 读取需要处理的数据,从数据库中以DataFrame的格式读取。 52 | focus_df = read_all_data(sheet_name, engine_mysql_test, sql) 53 | # print(focus_df) 54 | logging.logger.info("导入 %s 条数据" % len(focus_df)) 55 | 56 | res_grouped = [] 57 | focus_grouped = focus_df.groupby('symbol') 58 | for symbol, value in focus_grouped: 59 | counts = value['uid'].count() 60 | res_grouped.append([f(symbol), counts]) 61 | 62 | result = pd.DataFrame(res_grouped, columns=['symbol', 'focus_total_count']) 63 | save_time = date_time - 86400 # (24*60*60) 64 | 65 | result['created_at'] = str(time_util.timestamp_to_time(save_time, "%Y-%m-%d")) 66 | 67 | # 存储到表中 68 | # 创建数据库引擎 69 | result.to_sql('xueqiu_focus_total_count', engine_mysql_test, if_exists='append', index=False) 70 | # print(result) 71 | logging.logger.info('生成 %s 条数据' % len(result)) 72 | logging.logger.info('数据保存到第 %s 天' % str(time_util.timestamp_to_time(save_time, "%Y-%m-%d"))) 73 | logging.logger.info('program finished, end at %s' % str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))) -------------------------------------------------------------------------------- /src/parser/xueqiu/focus_parser/雪球大V关注股票.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": {} 7 | }, 8 | "source": [ 9 | "### 获取数据进行处理" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 5, 15 | "metadata": { 16 | "pycharm": {} 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import os\n", 21 | "import pdb\n", 22 | "import pandas as pd\n", 23 | "import datetime\n", 24 | "from jqdatasdk import *" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 6, 30 | "metadata": { 31 | "pycharm": {} 32 | }, 33 | "outputs": [ 34 | { 35 | "name": "stderr", 36 | "output_type": "stream", 37 | "text": [ 38 | "/home/kerry/work/workenv/alpha_mind/lib/python3.6/site-packages/numpy/lib/arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", 39 | " mask |\u003d (ar1 \u003d\u003d a)\n" 40 | ] 41 | }, 42 | { 43 | "ename": "NameError", 44 | "evalue": "name \u0027vip_stock_sets\u0027 is not defined", 45 | "output_type": "error", 46 | "traceback": [ 47 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 48 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 49 | "\u001b[0;32m\u003cipython-input-6-282aeaea1455\u003e\u001b[0m in \u001b[0;36m\u003cmodule\u003e\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mip_stock_sets\u001b[0m \u001b[0;34m\u003d\u001b[0m 
\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u0027/kywk/data/xq/vip_stock/vip_stock_sets.csv\u0027\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m\u003d\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----\u003e 2\u001b[0;31m vip_stock_sets[\u0027created\u0027] \u003d vip_stock_sets[\u0027created\u0027].apply(lambda x: datetime.datetime.strptime(x, \n\u001b[0m\u001b[1;32m 3\u001b[0m \u0027%Y-%m-%d %H:%M:%S\u0027))\n", 50 | "\u001b[0;31mNameError\u001b[0m: name \u0027vip_stock_sets\u0027 is not defined" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "ip_stock_sets \u003d pd.read_csv(\u0027/kywk/data/xq/vip_stock/vip_stock_sets.csv\u0027, index_col\u003d0).reset_index()\n", 56 | "vip_stock_sets[\u0027created\u0027] \u003d vip_stock_sets[\u0027created\u0027].apply(lambda x: datetime.datetime.strptime(x, \n", 57 | " \u0027%Y-%m-%d %H:%M:%S\u0027))" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "pycharm": {} 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "vip_stock_sets" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "pycharm": {} 75 | }, 76 | "source": [ 77 | "## 只保留沪深股票" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": { 84 | "pycharm": {} 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "def f(row):\n", 89 | " if row[:2] \u003d\u003d \u0027SH\u0027:\n", 90 | " return str(row[2:]) + \u0027.\u0027 + \u0027XSHG\u0027\n", 91 | " elif row[:2] \u003d\u003d \u0027SZ\u0027:\n", 92 | " return str(row[2:]) + \u0027.\u0027 + \u0027XSHE\u0027" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 4, 98 | "metadata": { 99 | "scrolled": false, 100 | "pycharm": {} 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "ex_vip_stock_sets \u003d vip_stock_sets.set_index(\u0027exchange\u0027)\n", 105 | "xsh_market_stock \u003d ex_vip_stock_sets.loc[\u0027SZ\u0027].append(ex_vip_stock_sets.loc[\u0027SH\u0027]).reset_index()\n", 106 | "xsh_market_stock[\u0027symbol\u0027] \u003d xsh_market_stock[\u0027symbol\u0027].apply(f)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "pycharm": {} 113 | }, 114 | "source": [ 115 | "## 统计近段时间股票关注比例" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 22, 121 | "metadata": { 122 | "scrolled": false, 123 | "pycharm": {} 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "bwteen_time \u003d datetime.datetime(2019,2,10)\n", 128 | "time_marekt_stock \u003d xsh_market_stock[xsh_market_stock[\u0027created\u0027] \u003e bwteen_time]\n", 129 | "time_marekt_stock \u003d time_marekt_stock[[\u0027exchange\u0027,\u0027name\u0027,\u0027symbol\u0027,\u0027vid\u0027]]\n", 130 | "symbol_market_stock \u003d time_marekt_stock.groupby(\u0027symbol\u0027).count()\n", 131 | "symbol_market_stock[\u0027ratio\u0027] \u003d symbol_market_stock[\u0027vid\u0027] / len(symbol_market_stock)\n", 132 | "symbol_market_stock \u003d symbol_market_stock.sort_values(by\u003d\u0027ratio\u0027, ascending\u003dFalse)[[\u0027ratio\u0027,\u0027vid\u0027]]\n", 133 | "symbol_market_stock.rename(columns\u003d{\u0027vid\u0027:\u0027count\u0027}, inplace\u003dTrue)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 23, 139 | "metadata": { 140 | 
"pycharm": {} 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "market_stock \u003d time_marekt_stock.drop_duplicates(subset\u003d\u0027symbol\u0027, keep\u003d\u0027first\u0027, inplace\u003dFalse).set_index(\u0027symbol\u0027)\n", 145 | "market_stock \u003d market_stock[[\u0027name\u0027]]\n", 146 | "v_market_stock \u003d symbol_market_stock.merge(market_stock, left_index\u003dTrue, right_index\u003dTrue)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "pycharm": {} 153 | }, 154 | "source": [ 155 | "# 获取对应行业" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 24, 161 | "metadata": { 162 | "pycharm": {} 163 | }, 164 | "outputs": [], 165 | "source": "industry_set \u003d [\u0027801010\u0027, \u0027801020\u0027, \u0027801030\u0027, \u0027801040\u0027, \u0027801050\u0027, \u0027801080\u0027, \u0027801110\u0027, \u0027801120\u0027, \u0027801130\u0027, \n \u0027801140\u0027, \u0027801150\u0027, \u0027801160\u0027, \u0027801170\u0027, \u0027801180\u0027, \u0027801200\u0027, \u0027801210\u0027, \u0027801230\u0027, \u0027801710\u0027,\n \u0027801720\u0027, \u0027801730\u0027, \u0027801740\u0027, \u0027801750\u0027, \u0027801760\u0027, \u0027801770\u0027, \u0027801780\u0027, \u0027801790\u0027, \u0027801880\u0027,\u0027801890\u0027]\nindustry_df \u003d pd.DataFrame(columns\u003dv_market_stock.index)\nfor industry in industry_set:\n industry_stocks \u003d get_industry_stocks(industry)\n industry_stocks \u003d list(set(industry_stocks)\u0026set(v_market_stock.index))\n industry_df.loc[\u0027industry\u0027,industry_stocks] \u003d industry\n\nindustry_df \u003d industry_df.T.dropna()\nindustry_df.reset_index(inplace \u003d True)\nindustry_df.set_index(\u0027symbol\u0027,inplace\u003dTrue)" 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 25, 170 | "metadata": { 171 | "pycharm": {} 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "industry_market_stock \u003d v_market_stock.merge(industry_df,left_index\u003dTrue, right_index\u003dTrue)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 26, 181 | "metadata": { 182 | "pycharm": {} 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "industry_set \u003d [\n", 187 | " {\u0027industry\u0027:\u0027801010\u0027,\u0027indname\u0027:\u0027农林牧渔I\u0027},\n", 188 | " {\u0027industry\u0027:\u0027801020\u0027,\u0027indname\u0027:\u0027采掘I\u0027},\n", 189 | " {\u0027industry\u0027:\u0027801030\u0027,\u0027indname\u0027:\u0027化工I\u0027},\n", 190 | " {\u0027industry\u0027:\u0027801040\u0027,\u0027indname\u0027:\u0027钢铁I\u0027},\n", 191 | " {\u0027industry\u0027:\u0027801050\u0027,\u0027indname\u0027:\u0027有色金属I\u0027},\n", 192 | " {\u0027industry\u0027:\u0027801080\u0027,\u0027indname\u0027:\u0027电子I\u0027},\n", 193 | " {\u0027industry\u0027:\u0027801110\u0027,\u0027indname\u0027:\u0027家用电器I\u0027},\n", 194 | " {\u0027industry\u0027:\u0027801120\u0027,\u0027indname\u0027:\u0027食品饮料I\u0027},\n", 195 | " {\u0027industry\u0027:\u0027801130\u0027,\u0027indname\u0027:\u0027纺织服装I\u0027},\n", 196 | " {\u0027industry\u0027:\u0027801140\u0027,\u0027indname\u0027:\u0027轻工制造I\u0027},\n", 197 | " {\u0027industry\u0027:\u0027801150\u0027,\u0027indname\u0027:\u0027医药生物I\u0027},\n", 198 | " {\u0027industry\u0027:\u0027801160\u0027,\u0027indname\u0027:\u0027公用事业I\u0027},\n", 199 | " {\u0027industry\u0027:\u0027801170\u0027,\u0027indname\u0027:\u0027交通运输I\u0027},\n", 200 | " {\u0027industry\u0027:\u0027801180\u0027,\u0027indname\u0027:\u0027房地产I\u0027},\n", 201 | " 
{\u0027industry\u0027:\u0027801200\u0027,\u0027indname\u0027:\u0027商业贸易I\u0027},\n", 202 | " {\u0027industry\u0027:\u0027801210\u0027,\u0027indname\u0027:\u0027休闲服务I\u0027},\n", 203 | " {\u0027industry\u0027:\u0027801230\u0027,\u0027indname\u0027:\u0027综合I\u0027},\n", 204 | " {\u0027industry\u0027:\u0027801710\u0027,\u0027indname\u0027:\u0027建筑材料I\u0027},\n", 205 | " {\u0027industry\u0027:\u0027801720\u0027,\u0027indname\u0027:\u0027建筑装饰I\u0027},\n", 206 | " {\u0027industry\u0027:\u0027801730\u0027,\u0027indname\u0027:\u0027电气设备I\u0027},\n", 207 | " {\u0027industry\u0027:\u0027801740\u0027,\u0027indname\u0027:\u0027国防军工I\u0027},\n", 208 | " {\u0027industry\u0027:\u0027801750\u0027,\u0027indname\u0027:\u0027计算机I\u0027},\n", 209 | " {\u0027industry\u0027:\u0027801760\u0027,\u0027indname\u0027:\u0027传媒I\u0027},\n", 210 | " {\u0027industry\u0027:\u0027801770\u0027,\u0027indname\u0027:\u0027通信I\u0027},\n", 211 | " {\u0027industry\u0027:\u0027801780\u0027,\u0027indname\u0027:\u0027银行I\u0027},\n", 212 | " {\u0027industry\u0027:\u0027801790\u0027,\u0027indname\u0027:\u0027非银金融I\u0027},\n", 213 | " {\u0027industry\u0027:\u0027801880\u0027,\u0027indname\u0027:\u0027汽车I\u0027},\n", 214 | " {\u0027industry\u0027:\u0027801890\u0027,\u0027indname\u0027:\u0027机械设备I\u0027}]\n", 215 | "market_industry \u003d pd.DataFrame(industry_set).set_index(\u0027industry\u0027)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 27, 221 | "metadata": { 222 | "pycharm": {} 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "industry_market_stock \u003dindustry_market_stock.reset_index().set_index(\n", 227 | " \u0027industry\u0027).merge(market_industry,left_index\u003dTrue, right_index\u003dTrue)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 28, 233 | "metadata": { 234 | "scrolled": false, 235 | "pycharm": {} 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "result_socket \u003d industry_market_stock.sort_values(by\u003d\u0027count\u0027, ascending\u003dFalse)[:50]" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 29, 245 | "metadata": { 246 | "pycharm": {} 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "result_socket.to_csv(bwteen_time.strftime(\"%Y-%m-%d\") + \u0027_stock_50.csv\u0027,encoding\u003d\u0027UTF-8\u0027)\n", 251 | "result_socket.groupby(\u0027indname\u0027).count().to_csv(bwteen_time.strftime(\"%Y-%m-%d\") + \u0027group_50.csv\u0027,encoding\u003d\u0027UTF-8\u0027)\n", 252 | "\n", 253 | "industry_market_stock.groupby(\u0027indname\u0027).count().to_csv(bwteen_time.strftime(\"%Y-%m-%d\") + \u0027group_all.csv\u0027,encoding\u003d\u0027UTF-8\u0027)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "pycharm": {} 261 | }, 262 | "outputs": [], 263 | "source": [] 264 | } 265 | ], 266 | "metadata": { 267 | "kernelspec": { 268 | "display_name": "Python 3", 269 | "language": "python", 270 | "name": "python3" 271 | }, 272 | "language_info": { 273 | "codemirror_mode": { 274 | "name": "ipython", 275 | "version": 3 276 | }, 277 | "file_extension": ".py", 278 | "mimetype": "text/x-python", 279 | "name": "python", 280 | "nbconvert_exporter": "python", 281 | "pygments_lexer": "ipython3", 282 | "version": "3.6.7" 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 2 287 | } -------------------------------------------------------------------------------- /src/parser/xueqiu/log/dict_log.log.2019-04-29: 
-------------------------------------------------------------------------------- 1 | 2019-04-29 16:09:34,506 - /Users/li/PycharmProjects/event_parser/src/utils/dicts.py[line:189] - INFO: [Info] jieba总共添加了10889个自定义词汇。 2 | 2019-04-29 16:09:51,171 - /Users/li/PycharmProjects/event_parser/src/utils/dicts.py[line:189] - INFO: [Info] jieba总共添加了10889个自定义词汇。 3 | 2019-04-29 16:18:37,725 - /Users/li/PycharmProjects/event_parser/src/utils/dicts.py[line:189] - INFO: [Info] jieba总共添加了10889个自定义词汇。 4 | 2019-04-29 16:19:32,879 - /Users/li/PycharmProjects/event_parser/src/utils/dicts.py[line:189] - INFO: [Info] jieba总共添加了10889个自定义词汇。 5 | 2019-04-29 16:20:55,599 - /Users/li/PycharmProjects/event_parser/src/utils/dicts.py[line:189] - INFO: [Info] jieba总共添加了10889个自定义词汇。 6 | -------------------------------------------------------------------------------- /src/parser/xueqiu/log/discuss_stock_filter_daily.log.2019-04-29: -------------------------------------------------------------------------------- 1 | 2019-04-29 16:09:35,871 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:62] - INFO: program start at 2019-04-28 00:00:00 2 | 2019-04-29 16:09:37,541 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:82] - INFO: load discuss data from mysql successful 3 | 2019-04-29 16:09:38,095 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:120] - INFO: length of result: 55 4 | 2019-04-29 16:09:38,095 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:126] - INFO: program finished 5 | 2019-04-29 16:09:52,444 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:61] - INFO: program start at 2019-04-28 09:00:00 6 | 2019-04-29 16:09:53,530 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:81] - INFO: load discuss data from mysql successful 7 | 2019-04-29 16:09:53,981 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:119] - INFO: length of result: 55 8 | 2019-04-29 16:09:53,981 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:125] - INFO: program finished 9 | 2019-04-29 16:18:39,087 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:61] - INFO: program start at 2019-04-28 09:00:00 10 | 2019-04-29 16:18:39,087 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:62] - INFO: program stop at 2019-04-29 09:00:00 11 | 2019-04-29 16:18:40,227 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:82] - INFO: load discuss data from mysql successful 12 | 2019-04-29 16:18:40,686 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:120] - INFO: length of result: 55 13 | 2019-04-29 16:18:40,686 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:126] - INFO: program finished 14 | 2019-04-29 16:19:34,182 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:61] - INFO: program start at 2019-04-28 00:00:00 15 | 2019-04-29 16:19:34,182 - 
/Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:62] - INFO: program stop at 2019-04-29 00:00:00 16 | 2019-04-29 16:19:35,319 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:82] - INFO: load discuss data from mysql successful 17 | 2019-04-29 16:19:35,775 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:120] - INFO: length of result: 55 18 | 2019-04-29 16:19:35,775 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:126] - INFO: program finished 19 | 2019-04-29 16:20:56,877 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:61] - INFO: program start at 2019-04-28 00:00:00 20 | 2019-04-29 16:20:56,877 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:62] - INFO: program stop at 2019-04-29 00:00:00 21 | 2019-04-29 16:20:57,998 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:82] - INFO: load discuss data from mysql successful 22 | 2019-04-29 16:20:58,450 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:121] - INFO: length of result: 55 23 | 2019-04-29 16:20:58,450 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:126] - INFO: 数据保存到第 2019-04-28 天 24 | 2019-04-29 16:20:58,450 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:128] - INFO: program finished 25 | -------------------------------------------------------------------------------- /src/parser/xueqiu/log/tokenization_log.log.2019-04-29: -------------------------------------------------------------------------------- 1 | 2019-04-29 16:09:34,509 - /Users/li/PycharmProjects/event_parser/src/utils/tokenization.py[line:37] - INFO: Stopwords 导入成功! 2 | 2019-04-29 16:09:51,173 - /Users/li/PycharmProjects/event_parser/src/utils/tokenization.py[line:37] - INFO: Stopwords 导入成功! 3 | 2019-04-29 16:18:37,726 - /Users/li/PycharmProjects/event_parser/src/utils/tokenization.py[line:37] - INFO: Stopwords 导入成功! 4 | 2019-04-29 16:19:32,881 - /Users/li/PycharmProjects/event_parser/src/utils/tokenization.py[line:37] - INFO: Stopwords 导入成功! 5 | 2019-04-29 16:20:55,600 - /Users/li/PycharmProjects/event_parser/src/utils/tokenization.py[line:37] - INFO: Stopwords 导入成功! 
6 | -------------------------------------------------------------------------------- /src/parser/xueqiu/log/xueqiu_focus_statistic.log.2019-04-29: -------------------------------------------------------------------------------- 1 | 2019-04-29 15:32:14,329 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 2 | 2019-04-29 15:32:37,232 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 3 | 2019-04-29 15:34:08,280 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 4 | 2019-04-29 15:34:32,826 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 5 | 2019-04-29 15:35:56,258 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 51292-06-25 00:00:00 6 | 2019-04-29 15:36:11,297 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 7 | 2019-04-29 15:38:13,175 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 8 | 2019-04-29 15:40:33,020 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 9 | 2019-04-29 15:42:01,647 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 10 | 2019-04-29 15:42:03,936 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 11 | 2019-04-29 15:42:03,937 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished 12 | 2019-04-29 15:42:50,969 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 13 | 2019-04-29 15:42:53,807 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 14 | 2019-04-29 15:42:53,807 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished, end at 2019-04-29 15 | 2019-04-29 15:43:43,864 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 16 | 2019-04-29 15:43:46,227 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 17 | 2019-04-29 15:43:46,227 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished, end at 2019-04-29 00:00:00 18 | 2019-04-29 15:44:10,233 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 19 | 2019-04-29 15:44:12,612 - 
/Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 20 | 2019-04-29 15:44:12,612 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished, end at 2019-04-29 21 | 2019-04-29 15:44:47,022 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 22 | 2019-04-29 15:44:49,805 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 23 | 2019-04-29 15:47:00,046 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 24 | 2019-04-29 15:47:02,921 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 25 | 2019-04-29 15:47:02,922 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished, end at 2019.04.29 26 | 2019-04-29 15:47:47,126 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 27 | 2019-04-29 15:47:49,928 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 28 | 2019-04-29 15:47:49,928 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished, end at 2019-04-29 15:47:49 29 | 2019-04-29 15:50:14,033 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 30 | 2019-04-29 15:50:15,301 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:49] - INFO: load 20 data 31 | 2019-04-29 15:50:16,338 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: 生成 20 条数据 32 | 2019-04-29 15:50:16,338 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:67] - INFO: 数据保存到第 2019-04-28 天 33 | 2019-04-29 15:50:16,339 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:68] - INFO: program finished, end at 2019-04-29 15:50:16 34 | -------------------------------------------------------------------------------- /src/singlepass_run.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
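The clustering below is delegated to singlePassCluster.OnePassCluster(vector_tuple=tfidf_train_tuple, threshold=10). As a rough mental model only -- the real implementation lives in src/algorithm/cluster/singlePass and its distance measure and threshold semantics may differ -- single-pass clustering of the (news_id, tf-idf vector) tuples looks like this:

    import numpy as np

    def single_pass(vector_tuples, threshold):
        clusters = []                          # each: {'center': vec, 'members': [news_ids]}
        for news_id, vec in vector_tuples:
            vec = np.asarray(vec)
            best, best_dist = None, float('inf')
            for c in clusters:
                dist = np.linalg.norm(vec - c['center'])
                if dist < best_dist:
                    best, best_dist = c, dist
            if best is not None and best_dist <= threshold:
                best['members'].append(news_id)    # join the nearest existing cluster
            else:
                clusters.append({'center': vec, 'members': [news_id]})
        return clusters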
6 | @author: li 7 | @file: singlepass_run.py 8 | @time: 2018/11/29 8:04 PM 9 | 新闻聚类 10 | """ 11 | import sys 12 | import time 13 | import pickle 14 | sys.path.append('..') 15 | sys.path.append('../') 16 | sys.path.append('../../') 17 | from src.configure import conf 18 | from src.utils.log import log_util 19 | from src.utils.VSM import tfidf 20 | from src.algorithm.cluster.singlePass import singlePassCluster 21 | 22 | logging = log_util.Logger('singlepass_run') 23 | # corpus_train_path = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt" 24 | corpus_train_path = conf.corpus_train_path 25 | # tfidf_train, word_dict = tfidf_vector(corpus_train) 26 | # tfidf_train, word_dict = tfidf.tfidf_vector(corpus_train) 27 | corpus_train_dict = tfidf.load_data(corpus_train_path) 28 | 29 | # load tf-idf VSM 30 | tfidf_feature_path = conf.tfidf_feature_path 31 | tfidf_transformer_path = conf.tfidftransformer_path 32 | 33 | try: 34 | tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path) 35 | logging.logger.info("TF-IDF feature load success") 36 | tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path) 37 | logging.logger.info("TF-IDF transformer load success") 38 | except: 39 | logging.logger.info("TF-IDF model load failed, please check path %s,%s" % (tfidf_feature_path, 40 | tfidf_transformer_path)) 41 | sys.exit() 42 | # 计算历史新闻文本的TF-IDF,并与news_id组成tuple 43 | tf_idf_start_time = time.time() 44 | tfidf_train_tuple = tfidf.load_batch_tfidf_vector(corpus_train_dict, tfidf_feature, tfidf_transformer) 45 | logging.logger.info('TF-IDF of news calculate success, using {} s'.format(time.time() - tf_idf_start_time)) 46 | 47 | # tfidf_train_tuple = [] 48 | # for item in corpus_train_dict.items(): 49 | # catagory, corpus = item[1], item[0] 50 | # tfidf_train_tuple.append((catagory, tfidf.load_tfidf_vectorizer([corpus], tfidf_feature, tfidf_transformer))) 51 | 52 | # tfidf_train_dict, tfidf_train_tuple, word_dict = tfidf.tfidf_vectorizer(corpus_train_path) 53 | 54 | # 对输入的历史新闻文本进行singlepass聚类。 55 | # clustering = OnePassCluster(vector_tuple=tfidf_train.toarray(), threshold=10) 56 | # clustering = singlePassCluster.OnePassCluster(vector_tuple=tfidf_train_tuple, threshold=10) 57 | statrt_time = time.time() 58 | clustering = singlePassCluster.OnePassCluster(vector_tuple=tfidf_train_tuple, threshold=10) 59 | clustering.print_result() 60 | logging.logger.info('singPass cluster done, it take\'s %s s' % (time.time()-statrt_time)) 61 | 62 | # 将聚好的类簇保存下来,为后面的事件表示和有效事件判断使用。 63 | # clustering_path = '/Users/li/PycharmProjects/event_parser/src/model/clustering_new.pkl' 64 | clustering_path = conf.clustering_save_path 65 | with open(clustering_path, 'wb') as fw: 66 | pickle.dump(clustering, fw) 67 | logging.logger.info("cluster units save success in path{}".format(clustering_path)) 68 | # for cluster_index, cluster in enumerate(cluster_list): 69 | # print "cluster:%s" % cluster_index # 簇的序号 70 | # print cluster.node_list # 该簇的节点列表 71 | -------------------------------------------------------------------------------- /src/singlepass_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
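The nonzero() prints at the bottom of this script only eyeball the sparsity pattern; a more direct agreement check between the persisted pipeline and a fresh refit could look like this (sketch; it assumes tfidf_train_tuple and tfidf_train_tuple2, defined below, are aligned on the same document):

    import numpy as np

    v_loaded = tfidf_train_tuple[0][1]    # vector from the persisted feature/transformer
    v_refit = tfidf_train_tuple2[0][1]    # vector from refitting on the corpus
    print(np.array_equal(np.nonzero(v_loaded)[0], np.nonzero(v_refit)[0]))   # same sparsity pattern?
    print(np.allclose(v_loaded, v_refit))                                    # same values?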
6 | @author: li 7 | @file: singlepass_test.py 8 | @time: 2018-12-27 20:38 9 | """ 10 | 11 | 12 | import sys 13 | import numpy as np 14 | sys.path.append('..') 15 | sys.path.append('../') 16 | sys.path.append('../../') 17 | from src.configure import conf 18 | from src.utils.log import log_util 19 | from src.utils.VSM import tfidf 20 | 21 | logging = log_util.Logger('singlepass_test') 22 | # corpus_train_path = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt" 23 | corpus_train_path = conf.corpus_train_path 24 | # tfidf_train, word_dict = tfidf_vector(corpus_train) 25 | # tfidf_train, word_dict = tfidf.tfidf_vector(corpus_train) 26 | corpus_train_dict = tfidf.load_data(corpus_train_path) 27 | 28 | # load tf-idf VSM 29 | tfidf_feature_path = conf.tfidf_feature_path 30 | tfidf_transformer_path = conf.tfidftransformer_path 31 | 32 | try: 33 | tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path) 34 | tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path) 35 | logging.logger.info("TF-IDF model load sucess") 36 | except: 37 | logging.logger.info("TF-IDF model load failed, please check path %s,%s" % (tfidf_feature_path, 38 | tfidf_transformer_path)) 39 | sys.exit() 40 | # 计算历史新闻文本的TF-IDF,并与news_id组成tuple 41 | tfidf_train_tuple = tfidf.load_batch_tfidf_vector(corpus_train_dict, tfidf_feature, tfidf_transformer) 42 | logging.logger.info('TF-IDF of news calculate success') 43 | 44 | # tfidf_train_tuple = [] 45 | # for item in corpus_train_dict.items(): 46 | # catagory, corpus = item[1], item[0] 47 | # tfidf_train_tuple.append((catagory, tfidf.load_tfidf_vectorizer([corpus], tfidf_feature, tfidf_transformer))) 48 | 49 | tfidf_train_dict, tfidf_train_tuple2, word_dict = tfidf.tfidf_vectorizer(corpus_train_path) 50 | 51 | # for i in tfidf_train_tuple[0][1]: 52 | # print i 53 | print(np.nonzero(tfidf_train_tuple[0][1])) 54 | print(np.nonzero(tfidf_train_tuple2[0][1])) 55 | 56 | print(tfidf_train_tuple[0][1] == np.nonzero(tfidf_train_tuple2[0][1])) 57 | 58 | # for i in dict(tfidf_train_tuple).items(): 59 | # print i[0] -------------------------------------------------------------------------------- /src/utils/Keywords.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
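The extractor below intersects HanLP keywords with jieba-TextRank keywords. A minimal standalone sketch of the same idea (assumes pyhanlp and jieba are installed; the str() cast is only a precaution in case HanLP hands back Java string proxies):

    from jieba import analyse
    from pyhanlp import HanLP

    doc = '水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露,根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标。'
    kw_jieba = set(analyse.textrank(doc, 20))
    kw_hanlp = set(str(w) for w in HanLP.extractKeyword(doc, 20))
    print(kw_jieba & kw_hanlp)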
6 | @author: li 7 | @file: Keywords.py 8 | @time: 2018/11/14 8:23 PM 9 | 关键词提取程序 10 | """ 11 | import sys 12 | sys.path.append("../") 13 | from src.utils.log import log_util 14 | from src.configure import Configure 15 | from pyhanlp import * 16 | from jieba import analyse 17 | from src.utils.tokenization import Tokenizer 18 | 19 | logging = log_util.Logger('keywordsLog') 20 | 21 | # logging.logger.info("running %s" % ' '.join(sys.argv)) 22 | 23 | 24 | class keywordsExtractor(object): 25 | def __init__(self): 26 | conf = Configure() 27 | self.stopwords_path = conf.stop_words_path 28 | self.percent = 0.1 29 | 30 | def run(self, document): 31 | 32 | # tk = Tokenizer() 33 | # document = tk.token(document) 34 | # 基于Hanlp库的关键词提取 35 | print("[Info] keywords by Hanlp:") 36 | keywords_hanlp = HanLP.extractKeyword(document, 20) 37 | # print(",".join(keyword for keyword in keywords_hanlp)) 38 | 39 | # 基于jieba库的关键词抽取 40 | # 添加停用词 41 | analyse.set_stop_words(self.stopwords_path) 42 | # 引入TextRank关键词抽取接口 43 | textrank = analyse.textrank 44 | print("[Info] keywords by textrank:") 45 | # keywords_jieba = textrank(document, 8, allowPOS=['n', 'nr', 'ns', 'vn', 'v']) 46 | # keywords_jieba = textrank(document, 20, withWeight=True) 47 | keywords_jieba = textrank(document, 20) 48 | # 输出抽取出的关键词 49 | # print(",".join(keyword for keyword in keywords_jieba)) 50 | 51 | # 两种关键词提取接口做交集 52 | print("[Info] 两个关键词提取方法取交集:") 53 | join_set = set(keywords_hanlp).intersection(set(keywords_jieba)) 54 | # print(",".join(item for item in join_set)) 55 | return join_set 56 | 57 | 58 | def test(): 59 | # 关键词提取 60 | document = u"水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \ 61 | u"根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标,\n" \ 62 | u"有部分省超过红线的指标。对一些超过红线的地方,\n陈明忠表示,对一些取用水项目进行区域的限批," \ 63 | u"严格地进行水资源论证和取水许可的批准。" 64 | 65 | kex = keywordsExtractor() 66 | keywords = kex.run(document) 67 | print(",".join(item for item in keywords)) 68 | 69 | 70 | if __name__ == '__main__': 71 | test() -------------------------------------------------------------------------------- /src/utils/VSM/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: 1.0 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019/12/12 1:58 下午 9 | """ -------------------------------------------------------------------------------- /src/utils/VSM/tfidf.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ??
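A typical round trip with the helpers in this module (sketch; the paths are the ones exposed by src.configure.conf and already used below):

    from src.configure import conf

    # fit on the training corpus and persist vocabulary + transformer
    _, tfidf_train_tuple, word_dict = tfidf_vectorizer(conf.corpus_train_path)

    # later, at prediction time: reload and transform only (never re-fit)
    feature = load_tfidf_feature(conf.tfidf_feature_path)
    transformer = load_tfidf_transformer(conf.tfidftransformer_path)
    vec = load_tfidf_vectorizer(['已 分词 的 文本'], feature, transformer)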
6 | @author: li 7 | @file: tfidf.py 8 | @time: 2018/11/28 4:03 PM 9 | """ 10 | import pickle 11 | import numpy as np 12 | 13 | from sklearn.feature_extraction.text import TfidfTransformer 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | 16 | import sys 17 | sys.path.append('..') 18 | sys.path.append('../') 19 | sys.path.append('../../') 20 | sys.path.append('/Users/li/PycharmProjects/event_parser/src') 21 | from src.configure import Configure 22 | 23 | conf = Configure() 24 | 25 | 26 | def load_data(corpus_path): 27 | corpus_train_dic = {} 28 | for line in open(corpus_path): 29 | line = line.strip().split('\t') 30 | if len(line) == 3: 31 | category = line[0] 32 | words = line[2] 33 | corpus_train_dic[category] = words 34 | return corpus_train_dic 35 | 36 | 37 | def tfidf_vectorizer(corpus_path): 38 | """vectorize the training documents""" 39 | corpus_train = [] 40 | category_train = [] 41 | for line in open(corpus_path): 42 | line = line.strip().split('\t') 43 | if len(line) == 3: 44 | category = line[0] 45 | words = line[2] 46 | category_train.append(category) 47 | corpus_train.append(words) 48 | print("build train-corpus done!!") 49 | print("corpus_train.shape %s" % np.shape(corpus_train)) 50 | # replace 必须加,保存训练集的特征 51 | count_vectorizer = CountVectorizer(decode_error="replace") 52 | # count_vectorizer = CountVectorizer(max_df=0.4, min_df=0.01, decode_error="replace") 53 | counts_train = count_vectorizer.fit_transform(corpus_train) 54 | 55 | word_dict = {} 56 | for index, word in enumerate(count_vectorizer.get_feature_names()): 57 | word_dict[index] = word 58 | print("The VSM shape of train is" + repr(counts_train.shape)) 59 | 60 | tfidftransformer = TfidfTransformer() 61 | # 注意在训练的时候必须用vectorizer.fit_transform、tfidftransformer.fit_transform 62 | # 在预测的时候必须用vectorizer.transform、tfidftransformer.transform 63 | tfidf_train = tfidftransformer.fit_transform(counts_train) 64 | # tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train) 65 | 66 | tfidf_train_array = tfidf_train.toarray() 67 | tfidf_train_dict = {} 68 | for item in range(len(tfidf_train_array)): 69 | tfidf_train_dict[category_train[item]] = tfidf_train_array[item] 70 | 71 | tfidf_train_tuple = [] 72 | for item in range(len(tfidf_train_array)): 73 | tfidf_train_tuple.append((category_train[item], tfidf_train_array[item])) 74 | 75 | # 保存经过fit的vectorizer 与 经过fit的tfidftransformer,预测时使用 76 | # tfidf_feature_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/feature_1.pkl' 77 | tfidf_feature_path = conf.tfidf_feature_path 78 | with open(tfidf_feature_path, 'wb') as fw: 79 | pickle.dump(count_vectorizer.vocabulary_, fw) 80 | 81 | # tfidftransformer_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/tfidftransformer_1.pkl' 82 | tfidftransformer_path = conf.tfidftransformer_path 83 | with open(tfidftransformer_path, 'wb') as fw: 84 | pickle.dump(tfidftransformer, fw) 85 | 86 | # word_dict_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/word_dict_1.pkl' 87 | word_dict_path = conf.word_dict_path 88 | with open(word_dict_path, 'wb') as fw: 89 | pickle.dump(word_dict, fw) 90 | 91 | return tfidf_train_dict, tfidf_train_tuple, word_dict 92 | 93 | 94 | def load_batch_tfidf_vector(corpus_train_dict, tfidf_feature, tfidf_transformer): 95 | """ 96 | 将语料库中的数据批量转换成VSM 97 | :param corpus_train_dict: 98 | :param tfidf_feature: 99 | :param tfidf_transformer: 100 | :return: 101 | """ 102 | tfidf_vsm_tuple = [] 103 | for item in corpus_train_dict.items(): 104 | 
category, corpus = item[0], item[1] 105 | tfidf_vsm_tuple.append((category, load_tfidf_vectorizer([corpus], tfidf_feature, tfidf_transformer))) 106 | 107 | return tfidf_vsm_tuple 108 | 109 | 110 | def load_tfidf_feature(tfidf_feature_path): 111 | """ 112 | load tf-idf VSM 113 | :param tfidf_feature_path: 114 | :return: 115 | """ 116 | tfidf_feature = pickle.load(open(tfidf_feature_path, "rb")) 117 | return tfidf_feature 118 | 119 | 120 | def load_tfidf_transformer(tfidf_transformer_path): 121 | """ 122 | load tf-idf transformer 123 | :param tfidf_transformer_path: 124 | :return: 125 | """ 126 | tfidf_transformer = pickle.load(open(tfidf_transformer_path, "rb")) 127 | return tfidf_transformer 128 | 129 | 130 | def load_tfidf_vectorizer(corpus_path, tfidf_feature, tfidf_transformer): 131 | """ 132 | :param tfidf_transformer: 133 | :param tfidf_feature: tf-idf feature 134 | :param corpus_path: 135 | :return: 136 | """ 137 | # if type(corpus_path) is not list: 138 | # corpus_test = [] 139 | # target_test = [] 140 | # for line in open(corpus_path): 141 | # line = line.strip().split('\t') 142 | # if len(line) == 3: 143 | # category = line[0] 144 | # words = line[2] 145 | # target_test.append(category) 146 | # corpus_test.append(words) 147 | # else: 148 | # corpus_test = corpus_path 149 | corpus_test = corpus_path 150 | # 加载特征 151 | loaded_vec = CountVectorizer(decode_error="replace", vocabulary=tfidf_feature) 152 | # 加载TfidfTransformer 153 | # 测试用transform,表示测试数据,为list 154 | test_tfidf = tfidf_transformer.transform(loaded_vec.transform(corpus_test)) 155 | return test_tfidf.toarray().reshape(-1) 156 | 157 | 158 | def tfidf_vector_test(corpus_path): 159 | """vectorize the input documents""" 160 | corpus_train = [] 161 | # 利用train-corpus提取特征 162 | target_train = [] 163 | for line in open(corpus_path): 164 | line = line.strip().split('\t') 165 | if len(line) == 3: 166 | words = line[2] 167 | category = line[0] 168 | target_train.append(category) 169 | corpus_train.append(words) 170 | print("build train-corpus done!!") 171 | count_v1 = CountVectorizer(max_df=0.4, min_df=0.01) 172 | # count_v1 = CountVectorizer() 173 | counts_train = count_v1.fit_transform(corpus_train) 174 | 175 | word_dict = {} 176 | for index, word in enumerate(count_v1.get_feature_names()): 177 | word_dict[index] = word 178 | 179 | print("the shape of train is " + repr(counts_train.shape)) 180 | tfidftransformer = TfidfTransformer() 181 | tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train) 182 | return tfidf_train, word_dict 183 | 184 | 185 | if __name__ == '__main__': 186 | # corpus_train = "/Users/li/PycharmProjects/event_parser/src/text_full_full.txt" 187 | corpus_train = conf.corpus_train_path 188 | tfidf_train_dic, tfidf_train_tuple, word_dict = tfidf_vectorizer(corpus_train) 189 | print(np.nonzero(tfidf_train_dic["111755669"])) 190 | print(np.shape(tfidf_train_dic['111755669'])) 191 | print(type(tfidf_train_dic['111755669'])) 192 | # print np.shape(tfidf_train.toarray()[0]) 193 | # print np.nonzero(tfidf_train.toarray()[0]) 194 | # for i in tfidf_train.toarray()[0]: 195 | # print i 196 | 197 | # corpus_data_dic = load_data(corpus_train) 198 | # print type(corpus_data_dic['111755669']) 199 | # tfidf_test = load_tfidf_vectorizer([corpus_data_dic['111755669']]).toarray().reshape(-1) 200 | # print np.nonzero(tfidf_test) 201 | -------------------------------------------------------------------------------- /src/utils/VSM/vector.py: 
-------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: vector.py 8 | @time: 2018/11/5 2:31 PM 9 | 词向量,文本向量训练模块 10 | 训练用的编码格式要与使用model时的编码格式一致。 11 | """ 12 | 13 | import sys 14 | import os 15 | sys.path.append("..") 16 | sys.path.append("../") 17 | sys.path.append("../../") 18 | import logging.handlers 19 | from src.utils import file_util 20 | import numpy as np 21 | import multiprocessing 22 | from gensim.models.word2vec import Word2Vec 23 | from gensim.models.doc2vec import Doc2Vec, LabeledSentence 24 | from src import data_reader 25 | 26 | LOG_FILE = '../log/vectors.log' 27 | file_util.check_path(LOG_FILE) 28 | handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=1) # 实例化handler 29 | fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(levelname)s - %(message)s' 30 | # logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 31 | formatter = logging.Formatter(fmt) 32 | handler.setFormatter(formatter) 33 | logger = logging.getLogger() 34 | logger.addHandler(handler) 35 | logger.setLevel(level=logging.INFO) 36 | # logger.setLevel(level=logging.DEBUG) 37 | logger.info("running %s" % ' '.join(sys.argv)) 38 | 39 | 40 | class word2vecs(object): 41 | """ 42 | :keyword Word2Vec模型训练 43 | """ 44 | def __init__(self, wd_configure): 45 | """ 46 | Word2Vec模型训练 47 | :param wd_configure: 模型的参数 48 | """ 49 | 50 | if "size" in wd_configure.keys(): 51 | # 训练时词向量维度,默认为100 52 | self.size = wd_configure["size"] 53 | else: 54 | self.size = 300 55 | 56 | if "min_count" in wd_configure.keys(): 57 | # min_count 不能设置过大,不然词汇表中会没有词汇 58 | # 需要训练词语的最小出现次数,默认为5 59 | self.min_count = wd_configure["min_count"] 60 | else: 61 | self.min_count = 1 62 | 63 | if "window" in wd_configure.keys(): 64 | self.window = wd_configure["window"] 65 | else: 66 | self.window = 5 67 | 68 | if "worker" in wd_configure.keys(): 69 | # 完成训练过程的线程数,默认为1不使用多线程, 只有注意安装Cython的前提下该参数设置才有意义 70 | self.worker = wd_configure["worker"] 71 | else: 72 | self.worker = multiprocessing.cpu_count() 73 | 74 | def train(self, sentences): 75 | """ 76 | 模型训练 77 | :param sentences:每行为一个list 如sentences = [['A1', 'A2'], ['A1', 'A2'], ['A1', 'A2', 'A1', 'A2']] 78 | :return: word2vec 模型 79 | # """ 80 | model_w2d = Word2Vec(size=self.size, window=self.window, min_count=self.min_count, workers=self.worker) 81 | model_w2d.build_vocab(sentences) 82 | model_w2d.train(sentences, total_examples=model_w2d.corpus_count, epochs=model_w2d.iter) 83 | return model_w2d 84 | 85 | def save(self, model, model_path): 86 | model.save(model_path) 87 | 88 | def load_model(self, model_path): 89 | return Word2Vec.load(model_path) 90 | 91 | 92 | class doc2vec(object): 93 | def __init__(self, dm_configure): 94 | if dm_configure.min_count: 95 | self.min_count = dm_configure.min_count 96 | else: 97 | self.min_count = 1 98 | 99 | if dm_configure.window: 100 | self.window = dm_configure.window 101 | else: 102 | self.window = 3 103 | 104 | if dm_configure.size: 105 | self.size = dm_configure.size 106 | else: 107 | self.size = 200 108 | 109 | if dm_configure.sample: 110 | self.sample = dm_configure.sample 111 | else: 112 | self.sample = 1e-3 113 | 114 | if dm_configure.negative: 115 | self.negative = dm_configure.negative 116 | else: 117 | self.negative = 5 118 | 119 | if dm_configure.workers: 120 | self.workers = dm_configure.workers 121 | else: 122 | self.workers = multiprocessing.cpu_count() 123 | 124 | def 
train(self, x_train): 125 | model_dm = Doc2Vec(x_train, min_count=self.min_count, window=self.window, size=self.size, 126 | sample=self.sample, negative=self.negative, workers=self.workers) 127 | model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=100) 128 | 129 | return model_dm 130 | 131 | def save(self, model_dm, model_path): 132 | # model_dm.save('../model/model_dm') 133 | model_dm.save(model_path) # model_dm.load(model_path) 134 | # model_dm.save_word2vec_format(model_path) # model_dm.load_word2vec_format(model_path,encoding='utf-8') 135 | 136 | # def train(x_train, size=200, epoch_num=1): 137 | # model_dm = Doc2Vec(x_train, min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4) 138 | # model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=100) 139 | # model_dm.save('../model/model_dm') 140 | # 141 | # return model_dm 142 | 143 | 144 | def word2vec_train(self): 145 | # load training data 146 | x_train = data_reader.get_news_data() 147 | # word2vec 训练测试 148 | wd_configure = {"size": 300, 149 | "window": 2, 150 | "min_count": 1} 151 | wd = word2vecs(wd_configure) 152 | model_wd = wd.train(x_train) 153 | print("[Info] word2vec模型训练结束") 154 | logger.info("[Info] word2vec模型训练结束") 155 | print(model_wd.wv[u'食品饮料']) 156 | # print model_wd.most_similar['食品饮料'] 157 | 158 | # 模型保存 159 | model_path = "/Users/li/PycharmProjects/event_parser/src/model/model_300_2_1" 160 | if not os.path.exists(model_path): 161 | wd.save(model_wd, model_path) 162 | else: 163 | print("[Exception] word2vec的保存路径已经存在。") 164 | 165 | 166 | def word2vec_load(model_path=None): 167 | """ 168 | load word2vec model 169 | :return: 170 | """ 171 | if model_path: 172 | model_path = model_path 173 | else: 174 | model_path = "/Users/li/PycharmProjects/event_parser/src/model/model_300_2_1" 175 | 176 | wd_conf = {"size": 300, 177 | "window": 5, 178 | "min_count": 1} 179 | model_wd = word2vecs(wd_conf) 180 | model_wd = model_wd.load_model(model_path) 181 | # print model_wd.wv[u'食饮料'] 182 | return model_wd 183 | 184 | 185 | def word_vector(word, w2v_model): 186 | """ 187 | 查找某个词的词向量 188 | :param word: 需要查找的词 189 | :param w2v_model: 词向量 shape = (vector_size, ) 190 | :return: 191 | """ 192 | try: 193 | vector = w2v_model.wv[word] 194 | return vector 195 | except KeyError: 196 | return np.zeros(w2v_model.vector_size) 197 | 198 | 199 | if __name__ == '__main__': 200 | model_path = "/Users/li/PycharmProjects/event_parser/src/model/model_300_2_1" 201 | 202 | # word2vec 训练测试 203 | wd_conf = {"size": 300, 204 | "window": 5, 205 | "min_count": 1} 206 | x_train = data_reader.get_data_sets() 207 | wd = word2vecs(wd_conf) 208 | model_wd = wd.train(x_train) 209 | print(model_wd.wv[u'球员']) 210 | 211 | # doc2vec 训练测试 212 | # cluster_centers = cluster(x_train) 213 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/10/30 10:45 AM 9 | """ 10 | import sys 11 | sys.path.append('../') 12 | sys.path.append('../../') 13 | 14 | -------------------------------------------------------------------------------- /src/utils/cluster_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
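cluster() below infers one Doc2Vec vector per document and hands the vectors to KMeans. The sklearn part in isolation, on toy data (illustrative only):

    import numpy as np
    from sklearn.cluster import KMeans

    X = np.random.rand(200, 50)        # stand-in for the infer_vector() outputs
    km = KMeans(n_clusters=15).fit(X)
    print(km.predict(X[:5]))           # cluster id per document
    print(km.cluster_centers_.shape)   # (15, 50)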
6 | @author: li 7 | @file: cluster_test.py 8 | @time: 2018/11/6 1:35 PM 9 | 聚类模块 10 | """ 11 | from sklearn.cluster import KMeans 12 | from gensim.models.doc2vec import Doc2Vec, LabeledSentence 13 | 14 | 15 | def cluster(x_train): 16 | infered_vectors_list = [] 17 | print("load doc2vec model...") 18 | model_dm = Doc2Vec.load("model/model_dm") 19 | print("load train vectors...") 20 | i = 0 21 | for text, label in x_train: 22 | vector = model_dm.infer_vector(text) 23 | infered_vectors_list.append(vector) 24 | i += 1 25 | 26 | print("train kmean model...") 27 | kmean_model = KMeans(n_clusters=15) 28 | kmean_model.fit(infered_vectors_list) 29 | labels = kmean_model.predict(infered_vectors_list[0:100]) 30 | cluster_centers = kmean_model.cluster_centers_ 31 | 32 | with open("out/own_claasify.txt", 'w') as wf: 33 | for i in range(100): 34 | string = "" 35 | text = x_train[i][0] 36 | for word in text: 37 | string = string + word 38 | string = string + '\t' 39 | string = string + str(labels[i]) 40 | string = string + '\n' 41 | wf.write(string) 42 | 43 | return cluster_centers 44 | -------------------------------------------------------------------------------- /src/utils/corpus_update.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: corpus_update.py 8 | @time: 2018-12-19 16:51 9 | """ 10 | import sys 11 | sys.path.append('../') 12 | sys.path.append('../../') 13 | sys.path.append('../../../') 14 | from src.utils import dicts 15 | import codecs # noqa: E402 16 | import pandas as pd # noqa: E402 17 | from src import data_reader # noqa: E402 18 | from src.configure import conf # noqa: E402 19 | 20 | 21 | def stock_code_data_process(): 22 | dic_path = conf.dic_path 23 | stock_new_path = dic_path + "/stock.csv" 24 | n2_path = dic_path + "/新增2" 25 | # 处理股票实体 26 | # 将stock_words.txt中的股票词转换成jieba用户自定义词典的格式,然后添加到jieba的userdict中 27 | # 读取股票代码 28 | stock_df = data_reader.read_stock_code('TQ_SK_BASICINFO') 29 | stock_df['SYMBOL'] = stock_df['SYMBOL'].apply(lambda x: "'" + x + "'") 30 | 31 | stock_dict = [] 32 | for s in stock_df.values: 33 | code, stock = s[0], s[1] 34 | stock_dict.append(code + ' ' + '5' + ' ' + 'n') 35 | stock_dict.append(stock.strip('\n').decode('utf-8') + ' ' + '5' + ' ' + 'n') 36 | f = codecs.open(n2_path, 'w', 'utf-8') 37 | for i in stock_dict: 38 | f.write(i + '\n') # \n为换行符 39 | f.close() 40 | # 数据保存 41 | stock_df.to_csv(path_or_buf=stock_new_path, index=False) 42 | 43 | 44 | if __name__ == '__main__': 45 | stock_code_data_process() 46 | dic_path = conf.dic_path 47 | stock_new_path = dic_path + "/stock.csv" 48 | data_df = pd.read_csv(stock_new_path, encoding="utf-8").set_index('SESNAME') 49 | print(data_df.loc[u'万科A'].values) 50 | # dicts.init() 51 | # print dicts.stock_dict 52 | # for index, row in data_df.iterrows(): 53 | # print row.SESNAME 54 | -------------------------------------------------------------------------------- /src/utils/data_process.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
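Typical call pattern for the stock-entity lookup defined below (sketch; elsewhere in the project the stock table is passed in indexed by SESNAME, so the same is assumed here):

    dp = DataPressing()
    tokens = ['中信证券', '上涨', '注册制']                      # output of Tokenizer.token()
    stocks = dp.find_stocks(tokens, data_df.set_index('SESNAME'))
    print(stocks)                                               # matching stock rows/codes, or []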
6 | @author: li 7 | @file: data_process.py 8 | @time: 2018/10/31 1:32 PM 9 | 金融文本解析类 10 | """ 11 | import sys 12 | sys.path.append('../') 13 | sys.path.append('../../') 14 | import re 15 | import pandas as pd 16 | from src.configure import conf 17 | 18 | dic_path = conf.dic_path 19 | stock_new_path = dic_path + "/stock.csv" 20 | data_df = pd.read_csv(stock_new_path, encoding="utf-8") 21 | 22 | 23 | class DataPressing(object): 24 | def __init__(self): 25 | # 杂质词 26 | self.pattern_word = u'(\\[AI\u51b3\u7b56\\])|(\u3010\u4eca\u65e5\u9898\u6750\u3011)' \ 27 | u'|(\u5173\u6ce8\u540c.*\u673a\u4f1a\u3002)' 28 | # [关注同花顺财经(ths518),获取更多机会。] 29 | self.pattern_text = u'(\\[AI\u51b3\u7b56\\])' 30 | self.num = 5 31 | 32 | def no_remove(self, text): 33 | """ 34 | 杂质词剔除,比如["今日走势", "AI决策"] 35 | :param text: 36 | :return: 37 | """ 38 | # result = re.sub(self.pattern_word, "", text.decode('utf8')) 39 | result = re.sub(self.pattern_word, "", text) 40 | return result 41 | 42 | def useless_contain(self, content): 43 | """ 44 | 判断content中是否包含某些字符 45 | :return: 46 | """ 47 | # py2使用 48 | # match_obj = re.search(self.pattern_text, content.decode('utf8')) 49 | match_obj = re.search(self.pattern_text, content) 50 | if match_obj: 51 | return True 52 | else: 53 | return False 54 | 55 | def useless_filter(self, content_list, stock_dicts): 56 | """ 57 | 如果文章中超过5只以上的股票,股市收报类的新闻,则将这篇文章剔除 58 | :param content_list: 分词之后的文章list 59 | :param stock_dicts: 股票代码 60 | :return: 61 | """ 62 | stock_num = 0 63 | for item in set(content_list): 64 | if item in stock_dicts: 65 | stock_num += 1 66 | 67 | if stock_num >= self.num: 68 | return True 69 | else: 70 | return False 71 | 72 | def find_stocks(self, content_list, stock_df): 73 | """ 74 | 提取content_list中所有的股票以及股票代码 75 | :param content_list: 分词之后的文章list 76 | :param stock_df: dataFrame 股票代码 77 | :return: 返回股票列表 78 | """ 79 | stock_num = [] 80 | for item in set(content_list): 81 | stock = [] 82 | # py2 使用 83 | # item = item.decode('utf-8') 84 | if item in stock_df.index.tolist(): 85 | res = stock_df.loc[item].values.tolist() 86 | if len(res) > 1: 87 | for i in range(len(res)): 88 | stock.extend(res[i]) 89 | else: 90 | stock.extend(res) 91 | stock_num.extend(stock) 92 | if len(stock_num) > 0: 93 | return stock_num 94 | else: 95 | return [] 96 | 97 | def find_keywords(self, content, key1, key2): 98 | """ 99 | 获取一大段文本之间两个关键字之间的内容 100 | :param content: 101 | :param key1: 102 | :param key2: 103 | :return: 104 | """ 105 | form = re.compile(key1 + '(.*?)' + key2, re.S) 106 | result = form.findall(content) 107 | return result 108 | -------------------------------------------------------------------------------- /src/utils/dicts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: dicts.py 8 | @time: 2018/11/5 2:31 PM 9 | jieba 字典初始化模块 10 | 功能:添加用户自定义词典,结巴添加新词 11 | 如果有新登陆词,可以在corpus中的新增中添加 12 | """ 13 | import jieba 14 | import pandas as pd 15 | from src.utils.log import log_util 16 | 17 | import sys 18 | sys.path.append('../') 19 | sys.path.append('../../') 20 | sys.path.append('../../../') 21 | try: 22 | from src.configure import conf 23 | except Exception: 24 | raise 25 | 26 | deg_dict = {} # 程度副词 27 | senti_dict = {} # 情感词 28 | eng_dict = {} # 英语或拼音词 29 | fou_dict = [] # 否定词 30 | but_dict = [] # 转折词 31 | lim_dict = [] # 限定词 32 | new_dict = [] # 新词 33 | zhi_dict = [] # 知网 34 | stock_dict = [] # 股票词 35 | stock_code_dict = [] # 股票代码 36 | jg_dict = [] # 机构名 37 | stock_df = [] 38 | 39 | logging = log_util.Logger('dict_log') 40 | 41 | 42 | class DictInit(object): 43 | pass 44 | 45 | 46 | def load_stock_data(): 47 | dic_path = conf.dic_path 48 | st_path = dic_path + "/stock_words.txt" 49 | st_new_path = dic_path + "/stock.csv" 50 | for st in open(st_path): 51 | # st = st.decode("utf8") 52 | code1, st_code = st.split("\t") 53 | code, stock = st_code.split(",") 54 | stock_code_dict.append(code.strip("\n")) 55 | stock_dict.append(stock.strip("\n")) 56 | 57 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 58 | # stock_df.append(stocks_df.set_index('SESNAME')) 59 | for index, row in stocks_df.iterrows(): 60 | stock_dict.append(row.SESNAME) 61 | stock_dict.append(row.SYMBOL) 62 | # 整理股票代码 63 | stocks_df = stocks_df.set_index('SESNAME') 64 | return stock_dict, stocks_df 65 | 66 | 67 | def init(): 68 | # dic_path = '/Users/li/PycharmProjects/huihongcaihui/src/corpus' 69 | dic_path = conf.dic_path 70 | 71 | # 读取词典 72 | d_path = dic_path + "/程度副词_datatang.txt" 73 | s_path = dic_path + "/senti.txt" 74 | f_path = dic_path + "/fou.txt" 75 | b_path = dic_path + "/but.txt" 76 | e_path = dic_path + "/eng.txt" 77 | l_path = dic_path + "/limit.dict" 78 | a_path = dic_path + "/dic.txt" 79 | ns_path = dic_path + "/新增_stock" 80 | n_path = dic_path + "/新增" 81 | n2_path = dic_path + "/新增2" 82 | st_path = dic_path + "/stock_words.txt" 83 | st_new_path = dic_path + "/stock.csv" 84 | zhi_ne_path = dic_path + "/知网/zhi_neg.txt" 85 | zhi_po_path = dic_path + "/知网/zhi_pos.txt" 86 | jg_path = dic_path + "/机构" 87 | 88 | # 添加基金公司实体名字,比如("工银瑞信基金", "华泰柏瑞基金", "东方基金") 89 | 90 | # 结巴新词 91 | word_add = set() 92 | 93 | for d in open(d_path): 94 | # temp = d.decode("utf-8").split(" ") 95 | temp = d.split(" ") 96 | word_arr = temp[1].strip("\n").rstrip(" ").split("、") 97 | for w in word_arr: 98 | deg_dict[w] = float(temp[0]) 99 | word_add.add(temp[0]) 100 | 101 | for s in open(s_path): 102 | # temp = s.decode("utf-8").split(" ") 103 | temp = s.split(" ") 104 | senti_dict[temp[0]] = float(temp[1]) 105 | word_add.add(temp[0]) 106 | 107 | for e in open(e_path): 108 | temp = e.split(" ") 109 | eng_dict[temp[0]] = float(temp[1]) 110 | word_add.add(temp[0]) 111 | 112 | for f in open(f_path): 113 | # f = f.decode("utf-8-sig") 114 | fou_dict.append(f.strip("\n")) 115 | word_add.add(f.strip("\n")) 116 | 117 | for b in open(b_path): 118 | but_dict.append(b.strip("\n")) 119 | word_add.add(b.strip("\n")) 120 | 121 | for l in open(l_path): 122 | lim_dict.append(l.strip("\n")) 123 | word_add.add(l.strip("\n")) 124 | 125 | for a in open(a_path): 126 | new_dict.append(a.strip("\n")) 127 | word_add.add(a.strip("\n")) 128 | 129 | for st in open(st_path): 130 | # st = st.decode("utf8") 131 | code1, st_code = st.split("\t") 132 | code, stock = st_code.split(",") 133 | 
stock_code_dict.append(code.strip("\n")) 134 | stock_dict.append(stock.strip("\n")) 135 | word_add.add(code.strip("\n")) 136 | word_add.add(stock.strip("\n")) 137 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 138 | stock_df.append(stocks_df.set_index('SESNAME')) 139 | for index, row in stocks_df.iterrows(): 140 | stock_dict.append(row.SESNAME) 141 | 142 | for z1 in open(zhi_ne_path): 143 | # z1 = z1.decode("utf8") 144 | new_dict.append(z1.strip("\n")) 145 | word_add.add(z1.strip("\n")) 146 | 147 | for z2 in open(zhi_po_path): 148 | # z2 = z2.decode("utf8") 149 | z2_data = z2.strip("\n") 150 | new_dict.append(z2_data) 151 | word_add.add(z2_data) 152 | 153 | for jg in open(jg_path): 154 | # jg = jg.decode("utf8") 155 | jg_data = jg.split("\t")[0].strip("\n") 156 | new_dict.append(jg_data) 157 | word_add.add(jg_data) 158 | 159 | ''' 160 | # 将stock_words.txt中的股票词转换成jieba用户自定义词典的格式,然后添加到jieba的userdict中 161 | for st in open(st_path): 162 | code1, st_code = st.split("\t") 163 | code, stock = st_code.split(",") 164 | stock_dict.append(code + ' ' + '5' + ' ' + 'n') 165 | stock_dict.append(stock.strip('\n').decode('utf-8') + ' ' + '5' + ' ' + 'n') 166 | apply_func = codecs.open(n_path, 'w', 'utf-8') 167 | for i in stock_dict: 168 | apply_func.write(i + '\n') # \n为换行符 169 | apply_func.close() 170 | ''' 171 | # 添加用户自定义字典 172 | jieba.load_userdict(ns_path) 173 | jieba.load_userdict(n_path) 174 | jieba.load_userdict(jg_path) 175 | jieba.load_userdict(n2_path) 176 | 177 | # 添加新词 178 | for w in word_add: 179 | jieba.add_word(w) 180 | 181 | # 结巴添加新词 182 | jieba.add_word("淡定") 183 | # jieba.add_word("加多宝") 184 | # jieba.add_word("红罐") 185 | jieba.add_word("非公开") 186 | jieba.add_word("不成人形") 187 | jieba.add_word("中美贸易战") 188 | logging.logger.info("[Info] jieba总共添加了{}个自定义词汇。".format(len(word_add))) 189 | 190 | 191 | if __name__ == '__main__': 192 | init() 193 | -------------------------------------------------------------------------------- /src/utils/engine/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: 1.0 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019/12/12 2:15 下午 9 | """ -------------------------------------------------------------------------------- /src/utils/engine/data_source.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li
7 | @file: data_source.py
8 | @time: 2018/10/30 6:39 PM
9 | """
10 | 
11 | 
12 | URL = 'url'
13 | DTYPE = 'DTYPE'
14 | OBJ = 'OBJ'
15 | 
16 | SQLALCHEMY = 1
17 | 
18 | __DNS = {
19 |     'DNDS': {
20 |         URL: 'mssql+pymssql://reader:reader@10.15.97.127:1433/dnds',
21 |         DTYPE: SQLALCHEMY
22 |     },
23 | 
24 |     'XAVIER': {
25 |         URL: 'mysql+mysqlconnector://root:t2R7P7@10.15.5.86:3306/xavier',
26 |         DTYPE: SQLALCHEMY
27 |     },
28 | 
29 |     'VISIONTEST': {
30 |         URL: 'mysql+mysqlconnector://root:1234@10.15.97.128:3306/test',
31 |         DTYPE: SQLALCHEMY
32 |     },
33 | 
34 |     'VISION': {
35 |         URL: 'mysql+mysqlconnector://root:1234@10.15.97.128:3306/vision',
36 |         DTYPE: SQLALCHEMY
37 |     },
38 | 
39 |     'XAVIER_DB': {
40 |         URL: 'mysql+mysqlconnector://root:t2R7P7@10.15.5.86:3306/xavier_db',
41 |         DTYPE: SQLALCHEMY
42 |     },
43 | 
44 |     'XAVIER_SQLITE': {
45 |         # URL: 'sqlite://///Users/li/workshop/dataset/database/xueqiu/discuss.db',
46 |         URL: 'sqlite://///Users/li/PycharmProjects/event_parser/src/parser/discuss_parser/discuss_data/discuss.db',
47 |         DTYPE: SQLALCHEMY
48 |     }
49 | 
50 | }
51 | 
52 | 
53 | def __getSqlAlchemyEngine(source):
54 |     if OBJ not in __DNS[source]:
55 |         import sqlalchemy as sa
56 |         __DNS[source][OBJ] = sa.create_engine(__DNS[source][URL])
57 |     return __DNS[source][OBJ]
58 | 
59 | 
60 | def GetDataEngine(source):
61 |     engine = None
62 |     if source in __DNS.keys():
63 |         if __DNS[source][DTYPE] == SQLALCHEMY:
64 |             return __getSqlAlchemyEngine(source)
65 |     else:
66 |         raise Exception("未知的数据源 --'{0}'".format(source))
67 |     return engine
68 | 
-------------------------------------------------------------------------------- /src/utils/engine/mysql_util.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li
7 | @file: mysql_util.py
8 | @time: 2018/11/28 4:44 PM
9 | """
10 | 
11 | import MySQLdb
12 | import z  # 项目的 MySQL 连接配置(mysql_ip / mysql_user / mysql_passwd / mysql_db / mysql_port)
13 | 
14 | 
15 | class mysql(object):
16 |     def __init__(self):
17 |         self.conn = MySQLdb.connect(host=z.mysql_ip,
18 |                                     user=z.mysql_user,
19 |                                     passwd=z.mysql_passwd,
20 |                                     db=z.mysql_db,
21 |                                     port=z.mysql_port)
22 |         self.cursor = self.conn.cursor()
23 | 
24 |     def close(self):
25 |         self.cursor.close()
26 |         self.conn.close()
27 | 
28 | 
29 | def load_from_mysql(query_str):
30 |     # 每次调用新建连接,避免复用已被关闭的全局连接
31 |     ms = mysql()
32 |     ms.cursor.execute(query_str)
33 |     result = ms.cursor.fetchall()
34 |     ms.close()
35 |     return result
36 | 
37 | 
38 | def insert_data(query_insert):
39 |     # SQL 插入语句
40 |     # sql = """INSERT INTO EMPLOYEE(FIRST_NAME,
41 |     #          LAST_NAME, AGE, SEX, INCOME)
42 |     #          VALUES ('Mac', 'Mohan', 20, 'M', 2000)"""
43 |     ms = mysql()
44 |     try:
45 |         # 执行sql语句
46 |         ms.cursor.execute(query_insert)
47 |         # 提交到数据库执行
48 |         ms.conn.commit()
49 |     except Exception:
50 |         # Rollback in case there is any error
51 |         ms.conn.rollback()
52 | 
53 |     # 关闭数据库连接
54 |     ms.close()
55 | 
56 | 
57 | def delete_data(query_delete):
58 |     # SQL 删除语句
59 |     # sql = "DELETE FROM EMPLOYEE WHERE AGE > %s" % (20)
60 |     ms = mysql()
61 |     try:
62 |         # 执行SQL语句
63 |         ms.cursor.execute(query_delete)
64 |         # 提交修改
65 |         ms.conn.commit()
66 |     except Exception:
67 |         # 发生错误时回滚
68 |         ms.conn.rollback()
69 | 
70 |     # 关闭连接
71 |     ms.close()
72 | 
73 | 
74 | def update_data(query_update):
75 |     # SQL 更新语句
76 |     # sql = "UPDATE EMPLOYEE SET AGE = AGE + 1 WHERE SEX = '%s'" % ('M')
77 |     ms = mysql()
78 |     try:
79 |         # 执行SQL语句
80 |         ms.cursor.execute(query_update)
81 |         # 提交修改
82 |         ms.conn.commit()
83 |     except Exception:
84 |         # 发生错误时回滚
85 |         ms.conn.rollback()
86 | 
87 |     # 关闭连接
88 |     ms.close()
-------------------------------------------------------------------------------- /src/utils/file_util.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li
7 | @file: file_util.py
8 | @time: 2018/11/19 8:52 AM
9 | """
10 | import os
11 | import datetime
12 | 
13 | 
14 | def check_path(_path):
15 |     """check whether the _path exists.
If not, create a new _path dit""" 16 | dir_name = os.path.dirname(_path) 17 | if dir_name: 18 | if not os.path.exists(dir_name): 19 | os.makedirs(dir_name) 20 | 21 | 22 | def list_all_files(file_path): 23 | _file = [] 24 | lists = os.listdir(file_path) 25 | for i in range(len(lists)): 26 | path = os.path.join(file_path, lists[i]) 27 | if os.path.isdir(path): 28 | _file.append(list_all_files(path)) 29 | if os.path.isfile(path): 30 | _file.append(path) 31 | 32 | return _file 33 | 34 | 35 | def find_newest_file(save_path): 36 | """ 37 | 从文件夹中读取最新保存或修改的文件。 38 | :param save_path: 目录地址 39 | :return: 40 | """ 41 | _file = [] 42 | lists = os.listdir(save_path) # 列出目录的下所有文件和文件夹保存到lists 43 | if len(lists) > 0: 44 | for i in range(len(lists)): 45 | path = lists[i] 46 | # 提取文件,剔除文件夹 47 | if os.path.isfile(save_path + path): 48 | _file.append(path) 49 | if len(_file) > 0: 50 | _file.sort(key=lambda fn: os.path.getmtime(save_path + fn)) # 将文件按时间排序 51 | file_new = _file[-1] # 获取最新的文件保存到file_new 52 | # filetime = datetime.datetime.fromtimestamp(os.path.getmtime(file_new)) 53 | else: 54 | file_new = 'NULL' 55 | else: 56 | file_new = 'NULL' 57 | # logging.logger.info("文件的最新修改时间:" + filetime.strftime('%Y-%m-%d %H:%M:%S')) 58 | # logging.logger.info("最新修改的文件(夹):" + lists[-1]) 59 | return file_new 60 | 61 | 62 | if __name__ == '__main__': 63 | print(find_newest_file("/Users/li/PycharmProjects/event_parser/src/model/event_model/")) 64 | # print list_all_files("/Users/li/PycharmProjects/event_parser/src/log/") 65 | -------------------------------------------------------------------------------- /src/utils/keywords_extractor.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: keywords_extractor.py 8 | @time: 2018/11/19 10:19 AM 9 | 基于textRank的关键词提取 10 | """ 11 | import numpy as np 12 | 13 | from src.utils import data_process, dicts, tokenization 14 | from src.utils.log import log_util 15 | 16 | logging = log_util.Logger('keywordsExtractor_log') 17 | 18 | 19 | class TextRank(object): 20 | def __init__(self, top_k=20, with_weight=False, window=5, alpha=0.85, min_diff=1000): 21 | """ 22 | :param top_k: return how many top keywords. `None` for all possible words. 23 | :param with_weight: if True, return a list of (word, weight); 24 | if False, return a list of words. 
25 | :param window: 26 | :param alpha: 27 | :param min_diff: 28 | """ 29 | # self.sentence = sentence 30 | self.word_list = "" 31 | self.window = window 32 | self.alpha = alpha 33 | self.edge_dict = {} # 记录节点的边连接字典 34 | self.iter_num = min_diff # 设置收敛阈值 35 | self.topK = top_k # 提取关键词的个数 36 | self.withWeight = with_weight 37 | 38 | def _cut_sentence(self, sentence): 39 | """ 40 | # 对句子进行分词 41 | :return: 42 | """ 43 | # 使用多进程的时候需要修改一下 44 | dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(), tokenization.load_stop_words() 45 | tk = tokenization.Tokenizer(dp, stop_words) 46 | self.word_list = tk.token(sentence) 47 | # dicts.init() 48 | # jieba.load_userdict('user_dict.txt') 49 | # tag_filter = ['a', 'd', 'n', 'v'] 50 | # seg_result = pseg.cut(self.sentence) 51 | # self.word_list = [s.word for s in seg_result if s.flag in tag_filter] 52 | # print(self.word_list) 53 | 54 | def _create_nodes(self): 55 | """ 56 | # 根据窗口,构建每个节点的相邻节点,返回边的集合 57 | :return: 58 | """ 59 | tmp_list = [] 60 | word_list_len = len(self.word_list) 61 | for index, word in enumerate(self.word_list): 62 | if word not in self.edge_dict.keys(): 63 | tmp_list.append(word) 64 | tmp_set = set() 65 | left = index - self.window + 1 # 窗口左边界 66 | right = index + self.window # 窗口右边界 67 | if left < 0: left = 0 68 | if right >= word_list_len: right = word_list_len 69 | for i in range(left, right): 70 | if i == index: 71 | continue 72 | tmp_set.add(self.word_list[i]) 73 | self.edge_dict[word] = tmp_set 74 | 75 | def _create_matrix(self): 76 | """ 77 | # 根据边的相连关系,构建矩阵 78 | :return: 79 | """ 80 | self.matrix = np.zeros([len(set(self.word_list)), len(set(self.word_list))]) 81 | self.word_index = {} # 记录词的index 82 | self.index_dict = {} # 记录节点index对应的词 83 | 84 | for i, v in enumerate(set(self.word_list)): 85 | self.word_index[v] = i 86 | self.index_dict[i] = v 87 | for key in self.edge_dict.keys(): 88 | for w in self.edge_dict[key]: 89 | self.matrix[self.word_index[key]][self.word_index[w]] = 1 90 | self.matrix[self.word_index[w]][self.word_index[key]] = 1 91 | # 归一化 92 | for j in range(self.matrix.shape[1]): 93 | summary = 0 94 | for i in range(self.matrix.shape[0]): 95 | summary += self.matrix[i][j] 96 | for i in range(self.matrix.shape[0]): 97 | self.matrix[i][j] /= summary 98 | 99 | def _cal_pr(self): 100 | """ 101 | # 根据textRank公式计算权重 102 | :return: 103 | """ 104 | # 105 | self.PR = np.ones([len(set(self.word_list)), 1]) 106 | for i in range(self.iter_num): 107 | self.PR = (1 - self.alpha) + self.alpha * np.dot(self.matrix, self.PR) 108 | 109 | def _print_result(self): 110 | """ 111 | # 输出词和相应的权重 112 | :return: 113 | """ 114 | word_pr = {} 115 | for i in range(len(self.PR)): 116 | word_pr[self.index_dict[i]] = self.PR[i][0] 117 | if self.withWeight: 118 | tags = sorted(word_pr.items(), key=lambda x: x[1], reverse=True) 119 | # tags = sorted(word_pr.items(), key=itemgetter(1), reverse=True) 120 | else: 121 | tags = sorted(word_pr, key=word_pr.__getitem__, reverse=True) 122 | 123 | if self.topK: 124 | return tags[:self.topK] 125 | else: 126 | return tags 127 | 128 | def run(self, sentence): 129 | if type(sentence) is not list: 130 | self._cut_sentence(sentence) 131 | else: 132 | self.word_list = sentence 133 | 134 | if len(self.word_list) > 1: # bug 如果sentence分词后只有一个单词,则直接输出 135 | self._create_nodes() 136 | self._create_matrix() 137 | self._cal_pr() 138 | result = self._print_result() 139 | else: 140 | result = self.word_list 141 | return result 142 | 143 | 144 | def d_test(): 145 | """ 146 | 类接口测试 147 | :return: 148 | """ 
149 | # s = '程序员(英文Programmer)是从事程序开发、维护的专业人员。' \ 150 | # '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,' \ 151 | # '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。' 152 | 153 | # s = '【今日题材】[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,' \ 154 | # '还是注册制的, 关注同花顺财经(ths58), 获取更多机会。' 155 | 156 | s = '中兴通讯(000063)在经历七个一字跌停板后,于今天打开跌停板。债转股开盘大涨,天津普林(002134)、信达地产(600657)' \ 157 | '、海德股份(000567)集体涨停,长航凤凰(000520)、浙江东方(600120)、陕国投A(000563)大涨,消息面上,' \ 158 | '央行宣布定向降准0.5个百分点,将重点支持债转股。中兴通讯机构最低估值12.02元/股在复牌之前,' \ 159 | '多家基金公司对中兴通讯估值大多调整至20.54元/股。连续7个跌停板后,中兴通讯A股股价早就已经跌穿这一价格。' \ 160 | '据《中国经营报》记者不完全统计,6月20日~22日,多家基金公司再做出调整中兴通讯A股估值的公告,下调公司包括工银瑞信基金、' \ 161 | '华泰柏瑞基金、东方基金、大摩华鑫基金、融通基金、大成基金等22家基金公司。值得注意的是,此次基金公司估值下调幅度并不一致,' \ 162 | '调整估值在每股12.02~16.64元之间。其中,大摩华鑫基金、融通基金和安信基金给出的估值最高,为每股16.64元,而工银瑞信基金、' \ 163 | '富国基金和泰达宏利基金给出的估值最低,为每股12.02元。关注同花顺财经(ths518),获取更多机会' 164 | 165 | # s = u"水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \ 166 | # u"根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标,\n" \ 167 | # u"有部分省超过红线的指标。对一些超过红线的地方,\n陈明忠表示,对一些取用水项目进行区域的限批," \ 168 | # u"严格地进行水资源论证和取水许可的批准。" 169 | 170 | dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(), tokenization.load_stop_words() 171 | tk = tokenization.Tokenizer(dp, stop_words) 172 | s_list = tk.token(s) 173 | # 根据句子的长度,动态划分关键词的个数 174 | # top_k = int(len(s_list) * 0.1) 175 | # text_rank = TextRank(s_list, top_k=15, with_weight=True) 176 | 177 | text_rank = TextRank(top_k=15) 178 | res = text_rank.run(s_list) 179 | logging.logger.info("提取的%s个关键词: " % len(res)) 180 | if text_rank.withWeight: 181 | print(",".join(item[0] for item in res)) 182 | print(",".join(str(item[1]) for item in res)) 183 | else: 184 | print(",".join(str(item) for item in res)) 185 | 186 | 187 | def parallel_test(text): 188 | text_rank = TextRank(top_k=15) 189 | return text_rank.run(text) 190 | 191 | 192 | def multi_extract(s_lists): 193 | from multiprocessing import Pool 194 | import multiprocessing as mp 195 | res_l = [] 196 | pool = Pool(processes=int(mp.cpu_count())) 197 | for s_list in s_lists: 198 | res = pool.apply_async(parallel_test, (s_list,)) 199 | res_l.append(res.get()) 200 | pool.close() 201 | pool.join() 202 | 203 | return res_l 204 | 205 | 206 | def multi_extract_test(): 207 | """ 208 | 多进程测试 209 | :return: 210 | """ 211 | import time 212 | from multiprocessing import Pool 213 | import multiprocessing as mp 214 | 215 | s = '程序员(英文Programmer)是从事程序开发、维护的专业人员。' \ 216 | '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,' \ 217 | '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。' 218 | 219 | dp = data_process.DataPressing() 220 | dict_init = dicts.init() 221 | stop_words = tokenization.load_stop_words() 222 | # 分词 223 | tk = tokenization.Tokenizer(dp, stop_words) 224 | s_list = tk.token(s) 225 | t0 = time.time() 226 | for i in range(10000): 227 | parallel_test(s_list) 228 | logging.logger.info("串行处理花费时间{t}".format(t=time.time()-t0)) 229 | 230 | pool = Pool(processes=int(mp.cpu_count())) 231 | res_l = [] 232 | t1 = time.time() 233 | for i in range(10000): 234 | res = pool.apply_async(parallel_test, (s_list,)) 235 | res_l.append(res) 236 | # pool.map(parallel_test, s_list) 237 | 238 | # for i in res_l: 239 | # print i.get() 240 | pool.close() 241 | pool.join() 242 | logging.logger.info("并行处理花费时间{t}s".format(t=time.time()-t1)) 243 | 244 | 245 | if __name__ == '__main__': 246 | d_test() 247 | # multi_extract_test() 248 | -------------------------------------------------------------------------------- /src/utils/log/__init__.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: 1.0
6 | @author: li
7 | @file: __init__.py
8 | @time: 2019/12/12 2:14 下午
9 | """
-------------------------------------------------------------------------------- /src/utils/log/log2.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li
7 | @file: log2.py
8 | @time: 2018-12-21 17:15
9 | """
10 | import logging.handlers
11 | from src.utils import file_util
12 | 
13 | 
14 | class LoggerConfig(object):
15 |     def __init__(self, log_file_name):
16 |         self.log_file_name = log_file_name
17 | 
18 |     def logger_info(self):
19 |         log_file = '../log/%s_info.log' % self.log_file_name
20 |         file_util.check_path(log_file)
21 |         handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=10240 * 1024, backupCount=5)  # 实例化handler
22 |         fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
23 |         formatter = logging.Formatter(fmt)  # 实例化formatter
24 |         handler.setFormatter(formatter)  # 为handler添加formatter
25 |         logger = logging.getLogger('info')  # 获取名为info的logger
26 |         if not logger.handlers:
27 |             logger.addHandler(handler)  # 为logger添加handler
28 |         logger.setLevel(logging.INFO)
29 |         return logger
30 | 
31 |     def logger_error(self):
32 |         log_file = '../log/%s_error.log' % self.log_file_name
33 |         file_util.check_path(log_file)
34 |         handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=10240 * 1024, backupCount=5)  # 实例化handler
35 |         fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
36 |         formatter = logging.Formatter(fmt)  # 实例化formatter
37 |         handler.setFormatter(formatter)  # 为handler添加formatter
38 |         logger = logging.getLogger('error')  # 获取名为error的logger
39 |         if not logger.handlers:
40 |             logger.addHandler(handler)  # 为logger添加handler
41 |         logger.setLevel(logging.ERROR)
42 |         return logger
-------------------------------------------------------------------------------- /src/utils/log/log_util.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li
7 | @file: log_util.py
8 | @time: 2018-12-26 16:33
9 | """
10 | 
11 | import logging
12 | from src.utils import file_util
13 | from logging import handlers
14 | 
15 | 
16 | class Logger(object):
17 |     # 日志级别映射
18 |     level_relations = {
19 |         'debug': logging.DEBUG,
20 |         'info': logging.INFO,
21 |         'warning': logging.WARNING,
22 |         'error': logging.ERROR,
23 |         'crit': logging.CRITICAL
24 |     }
25 | 
26 |     def __init__(self, log_file_name, level='debug', when='D', backup_count=5,
27 |                  fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
28 |         log_file = '../log/%s.log' % log_file_name
29 |         file_util.check_path(log_file)
30 |         self.logger = logging.getLogger(log_file)
31 |         format_str = logging.Formatter(fmt)  # 设置日志格式
32 |         self.logger.setLevel(self.level_relations.get(level))  # 设置日志级别
33 |         sh = logging.StreamHandler()  # 往屏幕上输出
34 |         sh.setFormatter(format_str)  # 设置屏幕上显示的格式
35 |         th = handlers.TimedRotatingFileHandler(filename=log_file, when=when, backupCount=backup_count,
36 |                                                encoding='utf-8')  # 往文件里写入,按指定间隔时间自动生成新文件的处理器
37 |         # 实例化TimedRotatingFileHandler
38 |         # interval是时间间隔,backupCount是备份文件的个数,如果超过这个个数,就会自动删除,when是间隔的时间单位,单位有以下几种:
39 |         # S 秒
40 |         # M 分
41 |         # H 小时
42 |         # D 天
43 |         # W 每星期(interval==0时代表星期一)
44 |         # midnight 每天凌晨
45 |         th.setFormatter(format_str)  # 设置文件里写入的格式
46 |         self.logger.addHandler(sh)  # 把对象加到logger里
47 |         self.logger.addHandler(th)
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     log = Logger('all', level='info')
52 |     log.logger.debug('debug')
53 |     log.logger.info('info')
54 |     log.logger.warning(u'警告')
55 |     log.logger.error(u'报错')
56 |     log.logger.critical(u'严重')
57 |     Logger('error.log', level='error').logger.error('error')
-------------------------------------------------------------------------------- /src/utils/news.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li
7 | @file: news.py
8 | @time: 2018/11/28 10:56 AM
9 | """
10 | 
11 | 
12 | class news(object):
13 |     def __init__(self, title, content, publish_time):
14 |         self.title = title
15 |         self.content = content
16 |         self.publish_time = publish_time
17 | 
18 |     def get_title(self):
19 |         return self.title
20 | 
21 |     def get_content(self):
22 |         return self.content
23 | 
24 |     def get_publish_time(self):
25 |         return self.publish_time
26 | 
27 |     def news_detail(self):
28 |         if self.title and self.content:
29 |             print(self.title + self.content)
30 | 
31 |     def news_lists(self):
32 |         pass
33 | 
-------------------------------------------------------------------------------- /src/utils/test.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li 7 | @file: test.py 8 | @time: 2018/11/12 2:19 PM 9 | """ 10 | import sys 11 | sys.path.append("..") 12 | from gensim import corpora, models, similarities 13 | from src.utils.tokenization import Tokenizer, load_stop_words 14 | from src.configure import Configure 15 | from src.utils import data_process 16 | 17 | 18 | conf = Configure() 19 | 20 | 21 | raw_documents = [ 22 | u'0无偿居间介绍买卖毒品的行为应如何定性', 23 | u'1吸毒男动态持有大量毒品的行为该如何认定', 24 | u'2如何区分是非法种植毒品原植物罪还是非法制造毒品罪', 25 | u'3为毒贩贩卖毒品提供帮助构成贩卖毒品罪', 26 | u'4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定', 27 | u'5为获报酬帮人购买毒品的行为该如何认定', 28 | u'6毒贩出狱后再次够买毒品途中被抓的行为认定', 29 | u'7虚夸毒品功效劝人吸食毒品的行为该如何认定', 30 | u'8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻', 31 | u'9一方未签字办理的结婚登记是否有效', 32 | u'10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚', 33 | u'11结婚前对方父母出资购买的住房写我们二人的名字有效吗', 34 | u'12身份证被别人冒用无法登记结婚怎么办?', 35 | u'13同居后又与他人登记结婚是否构成重婚罪', 36 | u'14未办登记只举办结婚仪式可起诉离婚吗', 37 | u'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚' 38 | ] 39 | 40 | 41 | # def tokenization(filename): 42 | # """ 43 | # 对语料进行分词,分词之后先按照词性过滤出一些停用词,然后在通过停用词表过滤掉一些停用词。 44 | # :param filename: 45 | # :return: 46 | # """ 47 | # dicts.init() # 初始化人工词典 48 | # result = [] 49 | # with open(filename, 'r') as apply_func: 50 | # text = apply_func.read() 51 | # words = pseg.cut(text) 52 | # for word, flag in words: 53 | # if flag not in stop_flag and word not in stopwords: 54 | # result.append(word) 55 | # return result 56 | 57 | 58 | # 语料库准备,导入所有的语料,并且进行分词,去停用词 59 | # filenames = ['/Users/yiiyuanliu/Desktop/nlp/demo/articles/13 件小事帮您稳血压.txt', 60 | # '/Users/yiiyuanliu/Desktop/nlp/demo/articles/高血压患者宜喝低脂奶.txt', 61 | # '/Users/yiiyuanliu/Desktop/nlp/demo/articles/ios.txt'] 62 | 63 | 64 | # corpus = [] 65 | # t = Tokenizer() 66 | # 67 | # for each in raw_documents: 68 | # corpus.append(t.token(each)) 69 | # print len(corpus) 70 | # 71 | # 72 | # for item in corpus[0]: 73 | # print item 74 | # 75 | # 76 | # def DictionaryBuild(corpus): 77 | # # 建立词袋模型。 78 | # dictionary = corpora.Dictionary(corpus) 79 | # return dictionary 80 | # 81 | # 82 | # dictionary = DictionaryBuild(corpus) 83 | # print dictionary 84 | # 85 | # 86 | # def docVectors(dictionary): 87 | # doc_vectors = [dictionary.doc2bow(text) for text in corpus] 88 | # print len(doc_vectors) 89 | # print doc_vectors 90 | # 91 | # docVectors(dictionary) 92 | 93 | 94 | # query = tokenization('/Users/yiiyuanliu/Desktop/nlp/demo/articles/关于降压药的五个问题.txt') 95 | # query_bow = dictionary.doc2bow(query) 96 | # print query_bow 97 | # 98 | # 99 | # # 文本相似度计算 100 | # # 基于积累的事件,首先计算所有事件的词向量或者tf-idf值,然后将新晋事件与最近的事件进行相似度计算,计算 101 | # lsi = models.LsiModel(tfidf_vectors, id2word=dictionary, num_topics=2) 102 | 103 | 104 | if __name__ == '__main__': 105 | import dicts 106 | data_processing = data_process.DataPressing() 107 | dict_init = dicts.init() 108 | stop_words = load_stop_words() 109 | t = Tokenizer(data_processing, stop_words) 110 | stock_dict = dicts.stock_dict 111 | print(["大智慧".decode("utf8")]) 112 | a = ["大智慧".decode("utf8")] 113 | print(len(a[0])) 114 | # print(["【今日题材】".decode("utf8")]) 115 | 116 | # file = open('file_name.txt', 'w') 117 | # file.write(str(raw_documents)) 118 | # file.close() 119 | 120 | # 剔除杂质词 121 | print(data_processing.no_remove("【今日题材】[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的")) 122 | 123 | # 判断content中是否存在某些特殊词 124 | print(data_processing.useless_contain("[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的")) 125 | 126 | # 筛选新闻,筛选出股市收报 127 | # str = 
'午后,分散染料概念股走强。截至发稿,浙江龙盛(600352)[AI决策](浙江龙盛(600352)[AI决策]-CN)涨6.74%报13.15元,闰土股份(002440)[AI决策](闰土股份(002440)[AI决策]-CN)涨5.84%报19.94元,安诺其(300067)[AI决策](安诺其(300067)[AI决策]-CN)涨5.46%报6.38元,吉华集团(603980)[AI决策](吉华集团(603980)[AI决策]-CN)涨3.41%报22.42元,航民股份(600987)[AI决策](航民股份(600987)[AI决策]-CN)、江苏吴中(600200)[AI决策](江苏吴中(600200)[AI决策]-CN)等个股跟随上涨近2%。据分散染料龙头企业介绍,由于环保形势的持续严峻,企业开工受到限制,染料供应量较少,库存偏低。染料贸易商和印染企业前期采购的分散染料,经过四季度的消耗库存已经很低,近期需要补仓,刚需力度增强。推荐阅读:浙江龙盛最新消息' 128 | # a = t.token(str) 129 | # tmp_res = data_processing.useless_filter(a, stock_dict) 130 | # print tmp_res 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /src/utils/time_util.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: time_util.py 8 | @time: 2018/11/29 10:15 AM 9 | """ 10 | 11 | import time 12 | import datetime 13 | 14 | 15 | def time_to_timestamp(time_str, style=None): 16 | """ 17 | 固定格式的时间转换成时间戳 18 | :param time_str: 19 | :param style: 20 | :return: 21 | """ 22 | if style is None: 23 | style = "%Y-%m-%d %H:%M:%S" 24 | time_array = time.strptime(time_str, style) 25 | time_stamp = int(time.mktime(time_array)) 26 | return time_stamp 27 | 28 | 29 | def timestamp_to_time(time_stamp, style=None): 30 | """ 31 | 时间戳转换成固定格式的时间 32 | :param time_stamp: 33 | :param style: 34 | :return: 35 | """ 36 | if style is None: 37 | style = "%Y-%m-%d %H:%M:%S" 38 | time_array = time.localtime(time_stamp) 39 | date_time = time.strftime(style, time_array) 40 | return date_time 41 | 42 | 43 | def get_integral_point_time(hour=0, minute=0, sec=0): 44 | """ 45 | 获取当天的某个时间点, 并转化成时间戳 46 | :param sec: 秒 47 | :param minute: 分钟 48 | :param hour: 小时 49 | :return: 50 | """ 51 | if hour > 24 or minute > 60 or sec > 60: 52 | print('time error in get_integral_point_time') 53 | exit() 54 | today = datetime.date.today().strftime("%Y-%m-%d") + ' %s:%s:%s' % (hour, minute, sec) 55 | time_array = time.strptime(today, "%Y-%m-%d %H:%M:%S") 56 | today_time = int(time.mktime(time_array)) 57 | return today_time 58 | 59 | 60 | if __name__ == '__main__': 61 | now = int(time.time()) 62 | # print now 63 | print(time_to_timestamp("2018-12-5 21:49:7", "%Y-%m-%d %H:%M:%S")) 64 | print(timestamp_to_time(now, "%Y-%m-%d %H:%M:%S")) 65 | print(get_integral_point_time(9)) 66 | -------------------------------------------------------------------------------- /src/utils/tokenization.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: tokenization.py 8 | @time: 2018/11/2 10:40 AM 9 | 分词模块 10 | 调用jieba分词,添加用户自定义词典,封装,并且去停用词等操作 11 | """ 12 | import codecs 13 | import sys 14 | 15 | import jieba.posseg as pseg 16 | 17 | from src.utils import data_process, dicts 18 | from src.utils.log import log_util 19 | 20 | sys.path.append('..') 21 | sys.path.append('../') 22 | sys.path.append('../../') 23 | from src.configure import Configure 24 | 25 | logging = log_util.Logger('tokenization_log') 26 | 27 | stopwords = globals() 28 | 29 | 30 | def load_stop_words(): 31 | # 停用词库准备, 构建停用词表 32 | conf = Configure() 33 | stop_words_path = conf.stop_words_path 34 | words_count = dict() 35 | try: 36 | stop_word = codecs.open(stop_words_path, 'r', encoding='utf8').readlines() 37 | stop_words = [w.strip() for w in stop_word] 38 | logging.logger.info("Stopwords 导入成功!") 39 | return stop_words 40 | except BaseException as e: 41 | logging.logger.error('Stop Words Exception: {0}'.format(e)) 42 | 43 | 44 | class Tokenizer(object): 45 | def __init__(self, data_process, stop_words): 46 | # dicts.init() # 初始化人工词典 47 | self.data_precessing = data_process 48 | # self.dicts = dict_init 49 | # 按照词性去停用词 50 | # 去停用词的词性列表,包括[标点符号、连词、助词、副词、介词、时语素、‘的’, 数词, 方位词, 代词, 形容词, 动词],暂时没有使用,原因是添加的新词没有添加词性,所以新词词性有问题。 51 | self.stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'apply_func', 'r'] 52 | self.stopwords = stop_words 53 | self.words_count = {} 54 | 55 | def remove_stopwords(self): 56 | # 使用映射来判断当前元素是否在字典中,速度会比list匹配快 57 | # [pandas apply 函数 多进程实现](https://blog.csdn.net/Jerry/article/details/71425298?utm_source=blogxgwz1#commentBox) 58 | pass 59 | 60 | def token(self, text): 61 | """ 62 | 对语料进行分词,分词之后先按照词性过滤出一些停用词,然后在通过停用词表过滤掉一些停用词。 63 | :param text: 64 | :return: 65 | """ 66 | if text is None: 67 | return None 68 | result = [] 69 | words = pseg.cut(self.data_precessing.no_remove(text)) 70 | 71 | # for word, flag in words: 72 | # result.append(word) 73 | 74 | for word, flag in words: 75 | if flag not in self.stop_flag and word not in self.stopwords and len(word) >= 2: 76 | result.append(word) 77 | return result 78 | 79 | 80 | def d_test(): 81 | data_processing = data_process.DataPressing() 82 | dict_init = dicts.init() 83 | stop_words = load_stop_words() 84 | tk = Tokenizer(data_processing, stop_words) 85 | # print(["大智慧".decode("utf8")]) 86 | # print(["【今日题材】".decode("utf8")]) 87 | # print(["关注同".decode("utf-8")]) 88 | 89 | # 剔除杂质词 90 | print(data_processing.no_remove("【今日题材】[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的, 关注同花顺财经(ths58), 获取更多机会。")) 91 | # 判断content中是否存在某些特殊词 92 | print(data_processing.useless_contain("[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的")) 93 | 94 | # 对content中的内容进行去停,去杂质词,分词 95 | # result = tk.token("【今日题材】[AI决策]加多宝的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的") 96 | result = tk.token("加多宝重推红罐 是否能再与王老吉争锋") 97 | print('Type of result: {}。'.format(type(result))) 98 | for i in result: 99 | print(i) 100 | 101 | 102 | def paralize_test(text, data_process, stop_words): 103 | t = Tokenizer(data_process, stop_words) 104 | restult = t.token(text) 105 | return restult 106 | 107 | 108 | def multi_token_test(): 109 | """ 110 | 多进程测试 111 | :return: 112 | """ 113 | import time 114 | from multiprocessing import Pool 115 | import multiprocessing as mp 116 | 117 | s = '程序员(英文Programmer)是从事程序开发、维护的专业人员。' \ 118 | '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,' \ 119 | '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。' 120 | 121 | dataprocess = data_process.DataPressing() 122 | dicts.init() 123 | stop_words = load_stop_words() 124 | # 
串行处理
125 |     t0 = time.time()
126 |     res1_l = []
127 |     for i in range(10000):
128 |         res1 = paralize_test(s, dataprocess, stop_words)
129 |         res1_l.append(res1)
130 |     print("串行处理花费时间{t}s".format(t=time.time() - t0))
131 | 
132 |     # 并行处理
133 |     t1 = time.time()
134 |     res2_l = []
135 |     pool = Pool(processes=int(mp.cpu_count() * 0.8))
136 |     for i in range(10000):
137 |         res = pool.apply_async(paralize_test, (s, dataprocess, stop_words))
138 |         res2_l.append(res)
139 |     # 获取数据
140 |     # for k in res2_l:
141 |     #     print k.get()
142 |     pool.close()
143 |     pool.join()
144 |     print("并行处理花费时间{t}s".format(t=time.time() - t1))
145 | 
146 | 
147 | # tokenizer = Tokenizer()
148 | if __name__ == '__main__':
149 |     d_test()
150 |     # multi_token_test()
151 | 
--------------------------------------------------------------------------------
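
A minimal usage sketch of how the utilities under src/utils chain together for one piece of news text (the headline below is a made-up example; the module, class and function names — dicts.init, data_process.DataPressing, tokenization.load_stop_words / Tokenizer, keywords_extractor.TextRank — all come from the files above, and the snippet assumes it is run from the project root so that src.* is importable):

# 用法示意(非项目原有代码):清洗 -> 分词 -> TextRank 关键词提取
from src.utils import data_process, dicts, tokenization
from src.utils.keywords_extractor import TextRank

if __name__ == '__main__':
    dicts.init()                                 # 加载用户自定义词典到 jieba
    dp = data_process.DataPressing()             # 杂质词清洗
    stop_words = tokenization.load_stop_words()  # 停用词表
    tk = tokenization.Tokenizer(dp, stop_words)

    headline = u'央行宣布定向降准0.5个百分点,债转股概念股集体走强。'  # 假设的新闻标题
    words = tk.token(headline)                   # 清洗、分词、去停用词后的词列表
    keywords = TextRank(top_k=5).run(words)      # 提取前 5 个关键词
    print(keywords)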