├── .gitignore ├── README.md ├── _config.yml ├── requirements.txt └── src ├── __init__.py ├── algorithm ├── __init__.py └── cluster │ ├── Kmeans │ ├── __init__.py │ └── k_means_cluster.py │ ├── LDA │ ├── __init__.py │ └── lda_cluster.py │ ├── __init__.py │ └── singlePass │ ├── __init__.py │ ├── singlePassCluster.py │ ├── singlePassCluster_copy.py │ └── singlepassrun.py ├── configure.py ├── data_reader.py ├── dynamic_update_event.py ├── event2mysql.py ├── event_update.sh ├── history_event.py ├── load_event_data.py ├── model └── __init__.py ├── parser ├── __init__.py ├── news_parser │ ├── __init__.py │ ├── dbscan.py │ ├── tonghuashun.py │ └── xueqiu.py ├── requirement.txt └── xueqiu │ ├── README.md │ ├── __init__.py │ ├── discuss_focus_statistics_daily.sh │ ├── discuss_parser │ ├── __init__.py │ ├── discuss_data │ │ ├── __init__.py │ │ ├── discuss.db │ │ └── discuss.db' │ ├── discuss_parser.py │ ├── format_transform.py │ ├── participle │ │ └── __init__.py │ ├── xueqiu_dicsuss_batch.py │ ├── xueqiu_discuss_batch_multi.py │ ├── xueqiu_discuss_csv.py │ ├── xueqiu_discuss_csv_bak.py │ ├── xueqiu_discuss_daily.py │ ├── xueqiu_discuss_daily_bak.py │ ├── xueqiu_discuss_parser.py │ └── xueqiu_discuss_parser_bak.py │ ├── focus_parser │ ├── __init__.py │ ├── xueqiu_focus_statistics.py │ └── 雪球大V关注股票.ipynb │ └── log │ ├── dict_log.log.2019-04-29 │ ├── discuss_stock_filter_daily.log.2019-04-29 │ ├── tokenization_log.log.2019-04-29 │ └── xueqiu_focus_statistic.log.2019-04-29 ├── singlepass_run.py ├── singlepass_test.py └── utils ├── Keywords.py ├── VSM ├── __init__.py ├── tfidf.py └── vector.py ├── __init__.py ├── cluster_test.py ├── corpus_update.py ├── data_process.py ├── dicts.py ├── engine ├── __init__.py ├── data_source.py └── mysql_util.py ├── event_util.py ├── file_util.py ├── keywords_extractor.py ├── log ├── __init__.py ├── log2.py └── log_util.py ├── news.py ├── test.py ├── time_util.py └── tokenization.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | .DS_Store 108 | .idea/ 109 | /src/corpus 110 | .log 111 | /src/parser/xueqiu/discuss_parser/hs_err_pid40588.log 112 | /src/algorithm/cluster/singlePass/c2.txt 113 | /src/data/text.txt 114 | /src/model/ 115 | /src/data/full_text.txt 116 | /src/data/corpus_train.txt 117 | /src/data/text_keyword.txt 118 | /src/data/cluster_keywords_lda.txt 119 | /src/data/cluster_keywords_lsi.txt 120 | /src/data/cluster_result_document.txt 121 | /src/data/cluster_result_keyword.txt 122 | /src/model/tfidf_model/feature.pkl 123 | /src/model/tfidf_model/tfidftransformer.pkl 124 | /src/model/model_300_2_1 125 | /src/data/text_full.txt 126 | /src/data/text_title.txt 127 | /src/data/text_title_cut.txt 128 | /src/data/text_full_full.txt 129 | /src/data/ 130 | /log/ 131 | /src/data_copy/ 132 | /src/log/ 133 | /src/parser/log/ 134 | /requ.txt 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 金融财经类新闻文本主题事件提取 2 | 3 | ## 代码结构 4 | . 
5 | ├── configure.py              # configuration module
6 | ├── data_reader.py            # data loading and preprocessing module
7 | ├── dynamic_update.py         # real-time event update module
8 | ├── history_event.py          # historical event building module
9 | ├── load_history_event.py     # loads the historical event library
10 | ├── algorithm
11 | │   └── cluster               # clustering module
12 | │       ├── Kmeans            # k-means clustering
13 | │       │   └── k_means_cluster.py
14 | │       ├── LDA               # LDA clustering
15 | │       │   └── lda_cluster.py
16 | │       └── singlePass        # singlePass clustering
17 | │           ├── singlePassCluster.py
18 | │           └── singlepassrun.py
19 | │
20 | ├── corpus
21 | ├── data                      # preprocessed data
22 | ├── log                       # logs for tokenization, keyword extraction, etc.
23 | ├── model                     # model artifacts such as clustering results, event results and tf-idf results
24 | │   ├── event_model           # event results
25 | │   └── tfidf_model           # tf-idf results
26 | └── utils
27 |     ├── Keywords.py           # keyword extraction code
28 |     ├── cluster.py
29 |     ├── data_process.py       # data preprocessing
30 |     ├── data_source.py        # data access
31 |     ├── dicts.py              # tokenization dictionaries
32 |     ├── event_util.py         # event classes
33 |     ├── keywords_extractor.py # keyword extraction
34 |     ├── my_util.py            # utility helpers
35 |     ├── mysql_util.py         # SQL helpers
36 |     ├── news.py               # news processing classes
37 |     ├── test.py               # test code
38 |     ├── tfidf.py              # tf-idf model training
39 |     ├── time_util.py          # time utilities
40 |     ├── tokenization.py       # tokenization module
41 |     └── vector.py             # vector space model module
42 | 
43 | # Main pipeline
44 | ## Step 1: Data preparation
45 | - Files involved: data_reader.py
46 | - Read all news published before a specified date from the database, then organize it into two parts.
47 | - Part 1: the title and body of each news item are concatenated, then preprocessed (tokenization, stop-word removal, etc.) and saved as news ID, publish time and tokenized body: [news_id, timestamp, contents]
48 | - Part 2: the news titles are extracted and saved as news ID, publish time and title: [news_id, timestamp, title]
49 | 
50 | ## Step 2: VSM training
51 | - Files involved: /utils/tfidf.py
52 | - Build the TF-IDF vector space model; the training corpus is the Part 1 data saved in Step 1.
53 | - Save the vector space model.
54 | 
55 | ## Step 3: singlePass clustering
56 | - Files involved: singlepass_run.py
57 | - Run singlePass clustering on the Part 1 data generated in Step 1.
58 | 
59 | ## Step 4: Historical event preparation
60 | - Files involved: history_event.py
61 | - Build the event library from the Step 3 clustering results: add event titles, filter the stocks each event involves, extract event keywords, and judge whether each event is valid.
62 | 
63 | ## Step 5: Event update
64 | - Files involved: dynamic_update.py
65 | - When new news appears in the database, merge it into the historical events; if a news item cannot be merged, a new event is created.
66 | 
67 | 
68 | ## Step 6: Writing data to the database
69 | - Files involved: event2mysql.py
70 | - After events are generated, they are formatted as the project requires and stored in the database; currently two tables are kept: [event ID, event title, stocks in the event, news in the event] and [stock, events involving the stock].
71 | 
72 | ## Step 7: Xueqiu discussion history statistics
73 | - Files involved: xueqiu_dicsuss_batch.py, xueqiu_discuss_batch_multi.py
74 | - Count the stocks mentioned in the Xueqiu discussion data.
75 | 
76 | ## Step 8: Data format conversion
77 | - Files involved: format_transform.py
78 | - Convert the statistics from Step 7 into per-stock discussion counts; the stock-code format is also normalized.
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | joblib
2 | tqdm
3 | backports.functools-lru-cache==1.5
4 | boto==2.49.0
5 | boto3==1.9.34
6 | botocore==1.12.34
7 | bz2file==0.98
8 | certifi==2018.10.15
9 | chardet==3.0.4
10 | cycler==0.10.0
11 | docutils==0.14
12 | futures==3.2.0
13 | gensim==3.6.0
14 | idna==2.7
15 | jieba==0.39
16 | jmespath==0.9.3
17 | JPype1==0.6.3
18 | kiwisolver==1.0.1
19 | matplotlib==2.2.3
20 | mysql-connector==2.1.6
21 | numpy==1.15.3
22 | pandas==0.23.4
23 | pyhanlp==0.1.44
24 | pyparsing==2.2.2
25 | python-dateutil==2.7.5
26 | pytz==2018.7
27 | requests==2.20.0
28 | s3transfer==0.1.13
29 | scikit-learn==0.20.0
30 | scipy==1.1.0
31 | six==1.11.0
32 | sklearn==0.0
33 | smart-open==1.7.1
34 | SQLAlchemy==1.2.12
35 | subprocess32==3.5.3
36 | urllib3==1.26.5
37 | 
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | #!
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/10/30 10:44 AM 9 | """ 10 | import sys 11 | sys.path.append('../') 12 | sys.path.append('..') 13 | sys.path.append('../../') 14 | # import pandas as pd 15 | # df = pd.DataFrame({"A":['a','a','b','c','d'],"B":[4,5,6,7,8]}).set_index("A") 16 | # # print df 17 | # kk = ['a', 'b'] 18 | # hj = [] 19 | # for j in kk: 20 | # tmp_res = (df.loc[j].values.tolist()) 21 | # if len(tmp_res) > 1: 22 | # for k in range(len(tmp_res)): 23 | # hj.extend(tmp_res[k]) 24 | # else: 25 | # hj.extend(tmp_res) 26 | # print hj 27 | 28 | 29 | # from collections import Counter 30 | # 31 | # stock_lists = ['a','b','b','b','b','c','c','c','d','d','d','d'] 32 | # 33 | # stock_lists_dict = Counter(stock_lists).items() 34 | # stock_lists_dict.sort(key=lambda item: item[1], reverse=True) 35 | # 36 | # stock_list = [] 37 | # for i in stock_lists_dict: 38 | # stock_list.append(i[0]) 39 | # 40 | # print stock_list 41 | 42 | 43 | # import pandas as pd 44 | # # from itertools import groupby #itertool还包含有其他很多函数,比如将多个list联合起来。。 45 | # # 46 | # df = pd.DataFrame({'event_id': [1, 2, 3,4, 5], 47 | # 'event_stock': [['i1','i2'], ['i3', 'i2'], ['i3', 'i5'], ['i9', 'i7'], ['i9']]}) 48 | # print df 49 | # 50 | # lst = {} 51 | # 52 | # for i in range(len(df)): 53 | # event_id = df.loc[i]['event_id'] 54 | # event_stock = df.loc[i]['event_stock'] 55 | # if len(event_stock) > 0: 56 | # for symbol in event_stock: 57 | # lst.setdefault(symbol, []).append(event_id) 58 | # 59 | # print lst 60 | 61 | # from collections import defaultdict 62 | # # lst = [{'a': 123}, {'a': 456},{'b': 789}] 63 | # 64 | # dic = {} 65 | # for _ in lst: 66 | # for k, v in _.items(): 67 | # dic.setdefault(k, []).append(v) 68 | # 69 | # print dic 70 | 71 | 72 | # from utils import file_util, event_util, time_util 73 | # from configure import conf 74 | # import datetime 75 | # import time 76 | # 77 | # history_event_file = file_util.find_newest_file(conf.event_save_path) 78 | # 79 | # print history_event_file 80 | -------------------------------------------------------------------------------- /src/algorithm/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019-03-05 10:34 9 | """ -------------------------------------------------------------------------------- /src/algorithm/cluster/Kmeans/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/11/26 9:49 AM 9 | """ -------------------------------------------------------------------------------- /src/algorithm/cluster/Kmeans/k_means_cluster.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: k_means_cluster.py 8 | @time: 2018/11/26 9:50 AM 9 | """ 10 | 11 | from sklearn.cluster import KMeans 12 | from sklearn import feature_extraction 13 | from sklearn.feature_extraction.text import TfidfTransformer 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | 16 | 17 | def tfidf_vector(corpus_path): 18 | """vectorize the input documents""" 19 | corpus_train = [] 20 | # 利用train-corpus提取特征 21 | target_train = [] 22 | for line in open(corpus_path): 23 | line = line.strip().split('\t') 24 | if len(line) == 2: 25 | words = line[1] 26 | category = line[0] 27 | target_train.append(category) 28 | corpus_train.append(words) 29 | print "build train-corpus done!!" 30 | # count_v1 = CountVectorizer(max_df=0.4, min_df=0.01) 31 | count_v1 = CountVectorizer() 32 | counts_train = count_v1.fit_transform(corpus_train) 33 | 34 | word_dict = {} 35 | for index, word in enumerate(count_v1.get_feature_names()): 36 | word_dict[index] = word 37 | 38 | print "the shape of train is " + repr(counts_train.shape) 39 | tfidftransformer = TfidfTransformer() 40 | tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train) 41 | return tfidf_train, word_dict 42 | 43 | 44 | def cluster_kmeans(tfidf_train, word_dict, cluster_docs, cluster_keywords, num_clusters): # K均值分类 45 | """topic cluster""" 46 | 47 | f_docs = open(cluster_docs, 'w+') 48 | km = KMeans(n_clusters=num_clusters) 49 | km.fit(tfidf_train) 50 | clusters = km.labels_.tolist() 51 | cluster_dict = {} 52 | order_centroids = km.cluster_centers_.argsort()[:, ::-1] 53 | doc = 1 54 | for cluster in clusters: 55 | f_docs.write(str(str(doc)) + ',' + str(cluster) + '\n') 56 | doc += 1 57 | if cluster not in cluster_dict: 58 | cluster_dict[cluster] = 1 59 | else: 60 | cluster_dict[cluster] += 1 61 | f_docs.close() 62 | cluster = 1 63 | 64 | f_clusterwords = open(cluster_keywords, 'w+') 65 | for ind in order_centroids: # 每个聚类选 50 个词 66 | words = [] 67 | for index in ind: 68 | words.append(word_dict[index]) 69 | print cluster, ','.join(words) 70 | # f_clusterwords.write(str(cluster) + '\t' + ','.join(words) + '\n') 71 | cluster += 1 72 | print '*****' * 5 73 | f_clusterwords.close() 74 | 75 | 76 | def best_kmeans(tfidf_matrix, word_dict): 77 | """select the best cluster num""" 78 | 79 | import matplotlib.pyplot as plt 80 | from matplotlib.font_manager import FontProperties 81 | from sklearn.cluster import KMeans 82 | from scipy.spatial.distance import cdist 83 | import numpy as np 84 | K = range(1, 300) 85 | meandistortions = [] 86 | for k in K: 87 | print k,'****'*5 88 | kmeans = KMeans(n_clusters=k) 89 | kmeans.fit(tfidf_matrix) 90 | meandistortions.append(sum(np.min(cdist(tfidf_matrix.toarray(), kmeans.cluster_centers_, 'euclidean'), axis=1)) / tfidf_matrix.shape[0]) 91 | plt.plot(K, meandistortions, 'bx-') 92 | plt.grid(True) 93 | plt.xlabel('Number of clusters') 94 | plt.ylabel('Average within-cluster sum of squares') 95 | plt.title('Elbow for Kmeans clustering') 96 | plt.show() 97 | 98 | 99 | if __name__=='__main__': 100 | corpus_train = "/Users/li/PycharmProjects/event_parser/src/text.txt" 101 | cluster_docs = "/Users/li/PycharmProjects/event_parser/src/cluster_result_document.txt" 102 | cluster_keywords = "/Users/li/PycharmProjects/event_parser/src/cluster_result_keyword.txt" 103 | num_clusters = 15 104 | tfidf_train, word_dict = tfidf_vector(corpus_train) 105 | best_kmeans(tfidf_train, word_dict) 106 | cluster_kmeans(tfidf_train, word_dict, cluster_docs, cluster_keywords, num_clusters) 
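
A note on choosing num_clusters: best_kmeans() above only plots the elbow curve, and the __main__ block then hard-codes num_clusters = 15 after visual inspection. Below is a minimal sketch, not taken from this repository, of how the elbow point could instead be picked programmatically from the same meandistortions list that best_kmeans() computes; pick_elbow_k and its arguments are illustrative names only.

def pick_elbow_k(distortions, k_values):
    # Approximate the "knee" of the distortion curve as the k with the largest
    # second-order difference of the average within-cluster distortion.
    best_k, best_score = k_values[1], float('-inf')
    for i in range(1, len(distortions) - 1):
        score = distortions[i - 1] - 2 * distortions[i] + distortions[i + 1]
        if score > best_score:
            best_score, best_k = score, k_values[i]
    return best_k

# Hypothetical usage, assuming K and meandistortions as defined in best_kmeans():
# num_clusters = pick_elbow_k(meandistortions, list(K))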
-------------------------------------------------------------------------------- /src/algorithm/cluster/LDA/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/11/26 9:49 AM 9 | """ -------------------------------------------------------------------------------- /src/algorithm/cluster/LDA/lda_cluster.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: lda_cluster.py 8 | @time: 2018/11/26 5:01 PM 9 | """ 10 | 11 | import os, sys 12 | 13 | reload(sys) 14 | sys.setdefaultencoding('utf-8') 15 | from gensim.models import LdaModel, TfidfModel, LsiModel 16 | from gensim import similarities 17 | from gensim import corpora 18 | 19 | 20 | def create_data(corpus_path): # 构建数据,先后使用doc2bow和tfidf model对文本进行向量表示 21 | sentences = [] 22 | sentence_dict = {} 23 | count = 0 24 | for line in open(corpus_path): 25 | # print line 26 | line = line.strip().split('\t') 27 | if len(line) == 2: 28 | sentence_dict[count] = line[1] 29 | count += 1 30 | sentences.append(line[1].split(',')) 31 | else: 32 | break 33 | # 对文本进行处理,得到文本集合中的词表 34 | dictionary = corpora.Dictionary(sentences) 35 | # 利用词表,对文本进行cbow表示 36 | corpus = [dictionary.doc2bow(text) for text in sentences] 37 | # 利用cbow,对文本进行tfidf表示 38 | tfidf = TfidfModel(corpus) 39 | corpus_tfidf = tfidf[corpus] 40 | return sentence_dict, dictionary, corpus, corpus_tfidf 41 | 42 | 43 | def lda_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lda): # 使用lda模型,获取主题分布 44 | lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=11) 45 | f_keyword = open(cluster_keyword_lda, 'w+') 46 | for topic in lda.print_topics(11, 53): 47 | print '****' * 5 48 | words = [] 49 | for word in topic[1].split('+'): 50 | word = word.split('*')[1].replace(' ', '') 51 | words.append(word) 52 | f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n') 53 | # 利用lsi模型,对文本进行向量表示,这相当于与tfidf文档向量表示进行了降维,维度大小是设定的主题数目 54 | corpus_lda = lda[corpus_tfidf] 55 | for doc in corpus_lda: 56 | print len(doc), doc 57 | return lda 58 | 59 | 60 | def lsi_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lsi): # 使用lsi模型,获取主题分布 61 | lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=11) 62 | f_keyword = open(cluster_keyword_lsi, 'w+') 63 | for topic in lsi.print_topics(11, 50): 64 | print topic[0] 65 | words = [] 66 | for word in topic[1].split('+'): 67 | word = word.split('*')[1].replace(' ', '') 68 | words.append(word) 69 | f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n') 70 | 71 | return lsi 72 | 73 | 74 | if __name__ == "__main__": 75 | corpus_path = "/Users/li/PycharmProjects/event_parser/src/text.txt" 76 | # corpus_path = "/Users/li/PycharmProjects/event_parser/src/corpus_train.txt" 77 | cluster_keyword_lda = '/Users/li/PycharmProjects/event_parser/src/cluster_keywords_lda.txt' 78 | cluster_keyword_lsi = '/Users/li/PycharmProjects/event_parser/src/cluster_keywords_lsi.txt' 79 | sentence_dict, dictionary, corpus, corpus_tfidf = create_data(corpus_path) 80 | # for i in corpus_tfidf: 81 | # print i 82 | lsi_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lsi) 83 | lda_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lda) 84 | 
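
lda_cluster.py trains the LDA and LSI models and writes the per-topic keywords to disk, but it does not show how a trained model is applied to unseen text. A minimal sketch of that step, assuming the lda model returned by lda_model(), the dictionary returned by create_data(), and an already-tokenized new_tokens list (those names come from the file above; the helper itself is not part of this repository):

def dominant_topic(lda, dictionary, new_tokens):
    bow = dictionary.doc2bow(new_tokens)        # bag-of-words vector for the new text
    topic_dist = lda.get_document_topics(bow)   # [(topic_id, probability), ...]
    if not topic_dist:
        return None
    return max(topic_dist, key=lambda item: item[1])

# Hypothetical usage with a tokenized headline:
# print(dominant_topic(lda, dictionary, [u'股票', u'利好', u'公告']))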
-------------------------------------------------------------------------------- /src/algorithm/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | # OnePassCluster 6 | 7 | @version: ?? 8 | @author: li 9 | @file: __init__.py.py 10 | @time: 2018/11/8 10:41 AM 11 | """ 12 | 13 | import sys 14 | sys.path.append('../') 15 | sys.path.append('..') 16 | sys.path.append('../../') -------------------------------------------------------------------------------- /src/algorithm/cluster/singlePass/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/11/28 3:49 PM 9 | """ 10 | -------------------------------------------------------------------------------- /src/algorithm/cluster/singlePass/singlePassCluster.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: singlePassCluster.py 8 | @time: 2018/11/26 9:48 AM 9 | """ 10 | import gc 11 | import time 12 | import numpy as np 13 | from math import sqrt 14 | from tqdm import tqdm 15 | 16 | 17 | class ClusterUnit(object): 18 | """ 19 | # 定义一个簇单元 20 | """ 21 | 22 | def __init__(self): 23 | self.node_list = [] # 该簇存在的节点列表 24 | self.node_num = 0 # 该簇节点数 25 | self.centroid = None # 该簇质心 26 | 27 | def add_node(self, node_id, node_vec): 28 | """ 29 | 为本簇添加指定节点,并更新簇心 30 | :param node_id: 节点ID 31 | :param node_vec: 该节点对应的特征向量 32 | :return: null 33 | """ 34 | self.node_list.append(node_id) 35 | try: 36 | self.centroid = (self.node_num * self.centroid + node_vec) / (self.node_num + 1) # 更新簇心 37 | except TypeError: 38 | self.centroid = np.array(node_vec) * 1 # 初始化质心 39 | self.node_num += 1 # 节点数加1 40 | 41 | def remove_node(self, node_id): 42 | # 移除本簇指定节点 43 | try: 44 | self.node_list.remove(node_id) 45 | # 更新簇心 46 | self.node_num -= 1 47 | except ValueError: 48 | raise ValueError("%s not in this cluster" % node_id) # 该簇本身就不存在该节点,移除失败 49 | 50 | def move_node(self, node_id, another_cluster): 51 | # 将本簇中的其中一个节点移至另一个簇 52 | self.remove_node(node_id=node_id) 53 | another_cluster.add_node(node_id=node_id) 54 | 55 | 56 | def euclidean_distance(vec_a, vec_b): 57 | # 计算向量a与向量b的欧式距离 58 | diff = vec_a - vec_b 59 | return sqrt(np.dot(diff, diff)) # dot计算矩阵内积 60 | 61 | 62 | def cosine_distance(vec_a, vec_b): 63 | # 计算向量a与向量b的余弦距离 64 | dot_product = 0.0 65 | norm_a = 0.0 66 | norm_b = 0.0 67 | for a, b in zip(vec_a, vec_b): 68 | dot_product += a * b 69 | norm_a += a ** 2 70 | norm_b += b ** 2 71 | if norm_a == 0.0 or norm_b == 0.0: 72 | return 0 73 | else: 74 | return round(dot_product / ((norm_a ** 0.5) * (norm_b ** 0.5)) * 100, 2) 75 | 76 | 77 | def cosine_distance_numpy(vector1, vector2): 78 | vector1 = vector1.reshape([-1]) 79 | vector2 = vector2.reshape([-1]) 80 | cos_v12 = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2)) 81 | return cos_v12 82 | 83 | 84 | class OnePassCluster: 85 | def __init__(self, threshold, vector_tuple): 86 | # t:一趟聚类的阈值 87 | self.threshold = threshold # 一趟聚类的阈值 88 | # self.vectors = np.array(vector_tuple) 89 | self.vectors = vector_tuple 90 | self.cluster_list = [] # 聚类后簇的列表 91 | t1 = time.time() 92 | self.clustering() 93 | t2 = time.time() 94 | self.cluster_num = len(self.cluster_list) # 
聚类完成后 簇的个数 95 | self.spend_time = t2 - t1 # 聚类花费的时间 96 | 97 | def clustering(self): 98 | self.cluster_list.append(ClusterUnit()) # 初始新建一个簇 99 | self.cluster_list[0].add_node(self.vectors[0][0], self.vectors[0][1]) # 将读入的第一个节点归于该簇 100 | for index in tqdm(range(len(self.vectors))[1:]): 101 | # min_distance = euclidean_distance(vec_a=self.vectors[index][1], 102 | # vec_b=self.cluster_list[0].centroid) # 与簇的质心的最小欧式距离 103 | min_distance = cosine_distance(vec_a=self.vectors[index][1], 104 | vec_b=self.cluster_list[0].centroid) # 与簇的质心的最小cosine距离 105 | 106 | # print("index:{}, min_distance:{}".format(index, min_distance)) 107 | min_cluster_index = 0 # 最小距离的簇的索引 108 | # print "len of cluster_list %s " % len(self.cluster_list) 109 | for cluster_index, cluster in enumerate(self.cluster_list[1:]): 110 | # enumerate会将数组或列表组成一个索引序列 111 | # 寻找距离最小的簇,记录下距离和对应的簇的索引 112 | # distance = euclidean_distance(vec_a=self.vectors[index][1], 113 | # vec_b=cluster.centroid) 114 | distance = cosine_distance(vec_a=self.vectors[index][1], 115 | vec_b=cluster.centroid) 116 | # print("cluster_index:{}, distance:{}".format(cluster_index, distance)) 117 | if distance > min_distance: # 使用欧式距离是改为小于号 118 | min_distance = distance 119 | min_cluster_index = cluster_index + 1 120 | # print 'max_dist: %s' % min_distance 121 | # print 'min_cluster_index: %s' % min_cluster_index 122 | if min_distance > self.threshold: # 最小距离小于阈值,则归于该簇 # 使用欧式距离时改为小于号 123 | self.cluster_list[min_cluster_index].add_node(self.vectors[index][0], self.vectors[index][1]) 124 | else: # 否则新建一个簇 125 | new_cluster = ClusterUnit() 126 | new_cluster.add_node(self.vectors[index][0], self.vectors[index][1]) 127 | self.cluster_list.append(new_cluster) 128 | del new_cluster 129 | gc.collect() 130 | 131 | def print_result(self, label_dict=None): 132 | # 打印出聚类结果 133 | # label_dict:节点对应的标签字典 134 | print("******* one-pass cluster result ***********") 135 | for index, cluster in enumerate(self.cluster_list): 136 | print("cluster:%s" % index) # 簇的序号 137 | print("簇心: %s" % cluster.centroid) # 簇心 138 | print(cluster.node_list) # 该簇的节点列表 139 | if label_dict is not None: 140 | print(" ".join([label_dict[n] for n in cluster.node_list])) # 若有提供标签字典,则输出该簇的标签 141 | print("node num: %s" % cluster.node_num) 142 | print("-------------") 143 | print("the number of nodes %s" % len(self.vectors)) 144 | print("the number of cluster %s" % self.cluster_num) 145 | print("spend time %.9fs" % (self.spend_time / 1000)) 146 | 147 | 148 | if __name__ == '__main__': 149 | # cluster unit 测试 150 | # cluster_unit = ClusterUnit() 151 | # cluster_unit.add_node(1, [1, 1, 2]) 152 | # cluster_unit.add_node(5, [2, 1, 2]) 153 | # cluster_unit.add_node(3, [3, 1, 2]) 154 | # print cluster_unit.centroid 155 | 156 | # 读取测试集 157 | temperature_all_city = np.loadtxt('c2.txt', delimiter=",", usecols=(3, 4)) # 读取聚类特征:[最高温度, 最低温度] 158 | temperature_all_city_index = np.loadtxt('c2.txt', delimiter=",", usecols=0) # 索引 159 | 160 | result = [] 161 | for i in range(len(temperature_all_city_index)): 162 | result.append((temperature_all_city_index[i], temperature_all_city[i])) 163 | 164 | xy_ = dict() 165 | xy = np.loadtxt('c2.txt', delimiter=",", usecols=(8, 9)) # 读取各地经纬度 166 | for i in range(len(temperature_all_city_index)): 167 | xy_[temperature_all_city_index[i]] = xy[i] 168 | 169 | f = open('c2.txt', 'r') 170 | lines = f.readlines() 171 | zone = [i.split(',')[1] for i in lines] # 读取地区并转化为字典 172 | zone_dict = dict() 173 | for i in range(len(zone)): 174 | zone_dict[temperature_all_city_index[i]] = zone[i] 175 | 
f.close() 176 | 177 | # 构建一趟聚类器 178 | clustering = OnePassCluster(vector_tuple=result, threshold=97) 179 | # clustering.print_result() 180 | clustering.print_result(label_dict=zone_dict) 181 | 182 | # 将聚类结果导出图 183 | # import matplotlib.pylab as pl 184 | # fig, ax = pl.subplots() 185 | # fig = zone_dict 186 | # c_map = pl.get_cmap('jet', clustering.cluster_num) 187 | # c = 0 188 | # for cluster in clustering.cluster_list: 189 | # for node in cluster.node_list: 190 | # ax.scatter(xy_[node][0], xy_[node][1], c=c, s=30, cmap=c_map, vmin=0, vmax=clustering.cluster_num) 191 | # c += 1 192 | # pl.show() 193 | -------------------------------------------------------------------------------- /src/algorithm/cluster/singlePass/singlePassCluster_copy.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: singlePassCluster.py 8 | @time: 2018/11/26 9:48 AM 9 | """ 10 | 11 | import numpy as np 12 | from math import sqrt 13 | import time 14 | import matplotlib.pylab as pl 15 | 16 | 17 | class ClusterUnit: 18 | """ 19 | # 定义一个簇单元 20 | """ 21 | def __init__(self): 22 | self.node_list = [] # 该簇存在的节点列表 23 | self.node_num = 0 # 该簇节点数 24 | self.centroid = None # 该簇质心 25 | 26 | def add_node(self, node, node_vec): 27 | """ 28 | 为本簇添加指定节点,并更新簇心 29 | node_vec:该节点的特征向量 30 | node:节点 31 | return:null 32 | """ 33 | self.node_list.append(node) 34 | try: 35 | self.centroid = (self.node_num * self.centroid + node_vec) / (self.node_num + 1) # 更新簇心 36 | except TypeError: 37 | self.centroid = np.array(node_vec) * 1 # 初始化质心 38 | self.node_num += 1 # 节点数加1 39 | 40 | def remove_node(self, node): 41 | # 移除本簇指定节点 42 | try: 43 | self.node_list.remove(node) 44 | # 更新簇心 45 | self.node_num -= 1 46 | except ValueError: 47 | raise ValueError("%s not in this cluster" % node) # 该簇本身就不存在该节点,移除失败 48 | 49 | def move_node(self, node, another_cluster): 50 | # 将本簇中的其中一个节点移至另一个簇 51 | self.remove_node(node=node) 52 | another_cluster.add_node(node_id=node) 53 | 54 | 55 | # cluster_unit = ClusterUnit() 56 | # cluster_unit.add_node(1, [1, 1, 2]) 57 | # cluster_unit.add_node(5, [2, 1, 2]) 58 | # cluster_unit.add_node(3, [3, 1, 2]) 59 | # print cluster_unit.centroid 60 | 61 | 62 | def euclidean_distance(vec_a, vec_b): 63 | # 计算向量a与向量b的欧式距离 64 | diff = vec_a - vec_b 65 | return sqrt(np.dot(diff, diff)) # dot计算矩阵内积 66 | 67 | 68 | def cosine_distance(vec_a, vec_b): 69 | # 计算向量a与向量b的余弦距离 70 | dot_product = 0.0 71 | normA = 0.0 72 | normB = 0.0 73 | for a, b in zip(vec_a, vec_b): 74 | dot_product += a * b 75 | normA += a ** 2 76 | normB += b ** 2 77 | if normA == 0.0 or normB == 0.0: 78 | return 0 79 | else: 80 | return round(dot_product / ((normA**0.5)*(normB**0.5)) * 100, 2) 81 | 82 | 83 | class OnePassCluster: 84 | def __init__(self, threshold, vector_list): 85 | # t:一趟聚类的阈值 86 | self.threshold = threshold # 一趟聚类的阈值 87 | self.vectors = np.array(vector_list) 88 | self.cluster_list = [] # 聚类后簇的列表 89 | t1 = time.time() 90 | self.clustering() 91 | t2 = time.time() 92 | self.cluster_num = len(self.cluster_list) # 聚类完成后 簇的个数 93 | self.spend_time = t2 - t1 # 聚类花费的时间 94 | 95 | def clustering(self): 96 | self.cluster_list.append(ClusterUnit()) # 初始新建一个簇 97 | self.cluster_list[0].add_node(0, self.vectors[0]) # 将读入的第一个节点归于该簇 98 | for index in range(len(self.vectors))[1:]: 99 | # min_distance = euclidean_distance(vec_a=self.vectors[index], 100 | # vec_b=self.cluster_list[0].centroid) # 与簇的质心的最小欧式距离 101 | min_distance = 
cosine_distance(vec_a=self.vectors[index], 102 | vec_b=self.cluster_list[0].centroid) # 与簇的质心的最小cosine距离 103 | 104 | # print("index:{}, min_distance:{}".format(index, min_distance)) 105 | min_cluster_index = 0 # 最小距离的簇的索引 106 | # print "len of cluster_list %s " % len(self.cluster_list) 107 | for cluster_index, cluster in enumerate(self.cluster_list[1:]): 108 | # enumerate会将数组或列表组成一个索引序列 109 | # 寻找距离最小的簇,记录下距离和对应的簇的索引 110 | # distance = euclidean_distance(vec_a=self.vectors[index], 111 | # vec_b=cluster.centroid) 112 | distance = cosine_distance(vec_a=self.vectors[index], 113 | vec_b=cluster.centroid) 114 | # print("cluster_index:{}, distance:{}".format(cluster_index, distance)) 115 | if distance > min_distance: # 使用欧式距离是改为小于号 116 | min_distance = distance 117 | min_cluster_index = cluster_index + 1 118 | print 'max_dist: %s' % min_distance 119 | print 'min_cluster_index: %s' % min_cluster_index 120 | if min_distance > self.threshold: # 最小距离小于阈值,则归于该簇 # 使用欧式距离时改为小于号 121 | self.cluster_list[min_cluster_index].add_node(index, self.vectors[index]) 122 | else: # 否则新建一个簇 123 | new_cluster = ClusterUnit() 124 | new_cluster.add_node(index, self.vectors[index]) 125 | self.cluster_list.append(new_cluster) 126 | del new_cluster 127 | 128 | def print_result(self, label_dict=None): 129 | # 打印出聚类结果 130 | # label_dict:节点对应的标签字典 131 | print "******* one-pass cluster result ***********" 132 | for index, cluster in enumerate(self.cluster_list): 133 | print "cluster:%s" % index # 簇的序号 134 | print "簇心: %s" % cluster.centroid # 簇心 135 | print cluster.node_list # 该簇的节点列表 136 | if label_dict is not None: 137 | print " ".join([label_dict[n] for n in cluster.node_list]) # 若有提供标签字典,则输出该簇的标签 138 | print "node num: %s" % cluster.node_num 139 | print "-------------" 140 | print "the number of nodes %s" % len(self.vectors) 141 | print "the number of cluster %s" % self.cluster_num 142 | print "spend time %.9fs" % (self.spend_time / 1000) 143 | 144 | 145 | if __name__ == '__main__': 146 | # 读取测试集 147 | temperature_all_city = np.loadtxt('c2.txt', delimiter=",", usecols=(3, 4)) # 读取聚类特征:[最高温度, 最低温度] 148 | xy = np.loadtxt('c2.txt', delimiter=",", usecols=(8, 9)) # 读取各地经纬度 149 | # print(temperature_all_city) 150 | f = open('c2.txt', 'r') 151 | lines = f.readlines() 152 | zone_dict = [i.split(',')[1] for i in lines] # 读取地区并转化为字典 153 | f.close() 154 | 155 | # 构建一趟聚类器 156 | clustering = OnePassCluster(vector_list=temperature_all_city, threshold=90) 157 | clustering.print_result(label_dict=zone_dict) 158 | 159 | # 将聚类结果导出图 160 | fig, ax = pl.subplots() 161 | fig = zone_dict 162 | c_map = pl.get_cmap('jet', clustering.cluster_num) 163 | c = 0 164 | for cluster in clustering.cluster_list: 165 | for node in cluster.node_list: 166 | ax.scatter(xy[node][0], xy[node][1], c=c, s=30, cmap=c_map, vmin=0, vmax=clustering.cluster_num) 167 | c += 1 168 | pl.show() 169 | -------------------------------------------------------------------------------- /src/algorithm/cluster/singlePass/singlepassrun.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: singlepass_run.py 8 | @time: 2018/11/29 8:04 PM 9 | """ 10 | import sys 11 | sys.path.append('..') 12 | sys.path.append('../') 13 | sys.path.append('../../') 14 | import pickle 15 | from src.configure import conf 16 | from src.utils.VSM import tfidf 17 | from src.algorithm.cluster.singlePass import singlePassCluster 18 | 19 | # corpus_train_path = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt" 20 | corpus_train_path = conf.corpus_train_path 21 | # tfidf_train, word_dict = tfidf_vector(corpus_train) 22 | # tfidf_train, word_dict = tfidf.tfidf_vector(corpus_train) 23 | tfidf_train_dict, tfidf_train_tuple, word_dict = tfidf.tfidf_vectorizer(corpus_train_path) 24 | # print np.shape(tfidf_train.toarray()) 25 | # print tfidf_train.toarray()[1] 26 | 27 | # clustering = OnePassCluster(vector_tuple=tfidf_train.toarray(), threshold=10) 28 | clustering = singlePassCluster.OnePassCluster(vector_tuple=tfidf_train_tuple, threshold=10) 29 | clustering.print_result() 30 | 31 | # 将聚好的类簇保存下来,为后面的事件表示和有效事件判断使用。 32 | # clustering_path = '/Users/li/PycharmProjects/event_parser/src/model/clustering_new.pkl' 33 | clustering_path = conf.clustering_save_path 34 | with open(clustering_path, 'wb') as fw: 35 | pickle.dump(clustering, fw) 36 | 37 | # for cluster_index, cluster in enumerate(cluster_list): 38 | # print "cluster:%s" % cluster_index # 簇的序号 39 | # print cluster.node_list # 该簇的节点列表 -------------------------------------------------------------------------------- /src/configure.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: configure.py 8 | @time: 2018/10/31 1:52 PM 9 | 配置文件 10 | """ 11 | 12 | import os 13 | 14 | 15 | class Configure(object): 16 | 17 | project_path = "/Users/li/PycharmProjects/event_parser/src" 18 | # project_path = os.getcwd() 19 | 20 | # singlepass_run 和history_event 使用同一个时间段前的新闻,动态更新使用该时间之后的新闻 21 | data_time = '1545235200' 22 | 23 | # 词典目录 24 | dic_path = project_path + '/corpus' 25 | stock_new_path = dic_path + "/stock.csv" 26 | 27 | # 停用词目录 28 | stop_words_path = project_path + '/corpus/stop_words_CN' 29 | 30 | # tf-idf 训练语料文件位置,标题和正文合并在一起 31 | corpus_train_path = project_path + "/data/text_full_index.txt" 32 | 33 | # 新闻标题的保存路径 34 | corpus_news_title = project_path + "/data/text_title_index.txt" 35 | 36 | # singlePass聚类结果保存目录文件 37 | clustering_save_path = project_path + '/model/clustering_new_10.pkl' 38 | # clustering_save_path = project_path + '/model/clustering_new_20.pkl' 39 | # clustering_save_path = project_path + '/model/clustering_new_30.pkl' 40 | # clustering_save_path = project_path + '/model/clustering_new_40.pkl' 41 | 42 | corpus_news = corpus_train_path 43 | 44 | event_unit_path = project_path + '/model/event_units_new_10.pkl' 45 | # event_unit_path = project_path + '/model/event_units_new_20.pkl' 46 | # event_unit_path = project_path + '/model/event_units_new_30.pkl' 47 | # event_unit_path = project_path + '/model/event_units_new_40.pkl' 48 | 49 | event_save_path = project_path + '/model/event_model/' 50 | 51 | # TF-IDF计算相关文件 52 | tfidf_feature_path = project_path + '/model/tfidf_model/feature_full.pkl' 53 | tfidftransformer_path = project_path + '/model/tfidf_model/tfidftransformer_full.pkl' 54 | word_dict_path = project_path + '/model/tfidf_model/word_dict_full.pkl' 55 | 56 | 57 | conf = Configure() 58 | 
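
configure.py pins project_path to an absolute path on the original author's machine ("/Users/li/PycharmProjects/event_parser/src") and leaves the os.getcwd() variant commented out, so every derived path only resolves in that one environment. A minimal, more portable alternative, under the assumption that configure.py stays inside src/ (a suggestion, not the project's current behaviour):

import os

# Resolve the directory configure.py itself lives in, i.e. .../src,
# regardless of which machine or working directory the code runs from.
project_path = os.path.dirname(os.path.abspath(__file__))

Because every other Configure attribute is built by string concatenation onto project_path, the rest of the class would keep working unchanged.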
-------------------------------------------------------------------------------- /src/dynamic_update_event.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: dynamic_update_event.py 8 | @time: 2018/12/5 2:23 PM 9 | 增量式事件更新,基于历史事件库,将新增新闻实时与历史事件库进行相似度计算,最后合并 10 | # 每天十二点之前更新一次 11 | # 每天开盘前更新一次 12 | """ 13 | import sys 14 | import gc 15 | import time 16 | import datetime 17 | from src import data_reader 18 | import pandas as pd 19 | from tqdm import tqdm 20 | 21 | sys.path.append('../') 22 | sys.path.append('..') 23 | sys.path.append('../../') 24 | 25 | from src.utils.log import log_util 26 | from src.configure import conf # noqa: E402 27 | from src.utils import event_util, file_util, data_process, dicts, tokenization, time_util # noqa: E402 28 | from src.utils.VSM import tfidf 29 | from src.algorithm.cluster.singlePass import singlePassCluster 30 | 31 | logging = log_util.Logger('dynamic_update', level='debug') 32 | logging.logger.info('事件库动态更新启动时间: {}'.format(time_util.timestamp_to_time(time.time()))) 33 | # step 1、读取指定日期之后的新闻 34 | # 初次动态更新时,event_save_path下保存的是event 35 | latest_event_file = file_util.find_newest_file(conf.event_save_path) 36 | if latest_event_file is None or latest_event_file is 'NULL': 37 | # 如果没有动态更新过事件, 则today_timestamp 38 | # 读取当前时间段时间 39 | now = datetime.date.today() 40 | today_timestamp = int(time.mktime(now.timetuple())) 41 | today = time_util.timestamp_to_time(today_timestamp) 42 | # logging.logger.info('读取新闻的起始时间: {}'.format(today)) 43 | # ordered_df = data_reader.get_ordered_data(timestamp=today_timestamp) 44 | else: 45 | # 使用事件的最后更新时间作为新闻的起止时间 46 | latest_event_time = latest_event_file.split('.')[0] 47 | today_timestamp = int(latest_event_time) 48 | today = time_util.timestamp_to_time(today_timestamp) 49 | 50 | logging.logger.info('读取新闻的起始时间: {}'.format(today)) 51 | ordered_df = data_reader.get_ordered_data(timestamp=today_timestamp) 52 | 53 | # load tf-idf VSM 54 | # tfidf_feature_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/feature_1.pkl' 55 | # tfidftransformer_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/tfidftransformer_1.pkl' 56 | tfidf_feature_path = conf.tfidf_feature_path 57 | tfidf_transformer_path = conf.tfidftransformer_path 58 | tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path) 59 | tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path) 60 | 61 | # 导入词典,停用词,数据处理接口,分词接口 62 | dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(), tokenization.load_stop_words() 63 | tk = tokenization.Tokenizer(dp, stop_words) 64 | 65 | # 提取dataFrame中的内容 66 | ordered_news_lists = data_reader.trans_df_data(ordered_df, tfidf_feature, tfidf_transformer, dp, tk) 67 | 68 | # 如果当天没有新闻更新,则直接退出程序,事件单元不需要更新。 69 | # 文章重复更新, 70 | if len(ordered_news_lists) <= 0: 71 | # print '今天没有新新闻,事件单元不更新' 72 | logging.logger.info('[事件库未更新]: 今天没有新新闻,事件单元不更新') 73 | sys.exit() 74 | 75 | # for tmp in ordered_news_lists: 76 | # print tmp[0], tmp[1] 77 | 78 | # step 2、导入历史事件 79 | # 如果第一次执行dynamic_update_event文件,则event_save_path 80 | # history_event_file = file_util.find_newest_file(conf.event_save_path) 81 | # history_event_file = conf.event_save_path + latest_event_file 82 | history_event_units = event_util.load_history_event(latest_event_file) 83 | # print "[Info] 事件库中事件的个数 %s" % len(history_event_units) 84 | logging.logger.info("[事件库中事件的个数:] 
{}".format(len(history_event_units))) 85 | # for index, event_unit in enumerate(history_event_units): 86 | # print "cluster: %s" % index # 簇的序号 87 | # print event_unit.node_list # 该簇的节点列表 88 | # print event_unit.centroid 89 | 90 | len_news = len(ordered_news_lists) 91 | new_event_units = [] 92 | new_event_units.extend(history_event_units) 93 | # step 3、遍历新新闻,然后将新新闻添加到事件单元中,更新事件单元的节点和簇心 94 | for news_index in tqdm(range(len_news)): # 遍历每一篇新的新闻 95 | # 新的节点id 96 | new_node_id = ordered_news_lists[news_index][0] 97 | # 新的节点的VSM 98 | new_node_vec = ordered_news_lists[news_index][2] 99 | # max_dist = singlePassCluster.cosine_distance(history_event_units[0].centroid, ordered_news_lists[news_index][2]) 100 | max_dist = singlePassCluster.cosine_distance(new_event_units[0].centroid, new_node_vec) 101 | min_event_index = 0 102 | for event_index, new_event_unit in enumerate(new_event_units[1:]): # 遍历每一个事件单元 103 | # 计算当前新闻和每个事件元之间距离 104 | # dist = singlePassCluster.cosine_distance(history_event_unit.centroid, ordered_news_lists[news_index][2]) 105 | dist = singlePassCluster.cosine_distance(new_event_unit.centroid, new_node_vec) 106 | # print 'dist: %s' % dist 107 | # 找出最大的距离的事件元 108 | if dist > max_dist: 109 | max_dist = dist 110 | min_event_index = event_index + 1 111 | logging.logger.info('[Info] new_node_id: %s' % new_node_id) 112 | logging.logger.info('[Info] len of new_event_unit: %s' % len(new_event_units)) 113 | logging.logger.info('[Info] max_dist: %s' % max_dist) 114 | logging.logger.info('[Info] min_cluster_index: %s\n' % min_event_index) 115 | # 如果最大距离大于某一个阈值,则将该新闻归并到该事件单元 116 | if max_dist > 10: 117 | # new_node_id = ordered_news_lists[news_index][0] 118 | # new_node_vec = ordered_news_lists[news_index][2] 119 | new_event_units[min_event_index].add_node(new_node_id, new_node_vec) 120 | # new_event_units[min_event_index].add_unit_title() 121 | # new_event_units[min_event_index].event_expression() 122 | else: 123 | # 否则则新建一个事件单元 124 | index = len(new_event_units) 125 | new_event = event_util.EventUnit() 126 | new_event.event_id = index 127 | new_event.add_node(new_node_id, new_node_vec) 128 | # new_event.add_unit_title() 129 | # new_event.event_expression() 130 | new_event_units.append(new_event) 131 | del new_event 132 | gc.collect() 133 | 134 | logging.logger.info('[更新后的事件个数]: {}'.format(len(new_event_units))) 135 | # step 4、对更新的事件库进行标题和关键词更新 136 | # 事件库更新,更新标题,关键词,股票代码。 137 | # 读取数据库中的所有新闻数据 138 | full_df_data = data_reader.get_all_data().set_index('id') 139 | 140 | # 股票及股票代码 141 | stock_df = pd.read_csv(conf.stock_new_path, encoding='utf-8').set_index('SESNAME') 142 | 143 | for unit in tqdm(new_event_units): 144 | if unit.event_tag == 1: 145 | # 更新标题,股票代码,关键词等 146 | logging.logger.info("事件 [%s] 是新事件" % unit.event_id) 147 | # 读取每个事件 148 | node_df_data = full_df_data.loc[set(unit.node_list)] 149 | node_news_lists = data_reader.trans_df_data(node_df_data.reset_index(), tfidf_feature, tfidf_transformer, dp, 150 | tk) 151 | news_list = [] 152 | news_title_list = [] 153 | for i in node_news_lists: 154 | # print i[1], i[4] 155 | news_list.append(i[1]) 156 | news_title_list.append(i[4]) 157 | # 更新股票列表 158 | unit.event_expression(news_title_list, news_list, stock_df) 159 | logging.logger.info("股票列表: %s" % ','.join(tmp for tmp in unit.stocks)) 160 | logging.logger.info("关键词列表: %s" % unit.keywords) 161 | # 更新标题 162 | node_news_dict = {} 163 | for node in node_news_lists: 164 | node_news_dict[node[0]] = (node[1], node[2], node[3], node[4]) 165 | unit.title_update(node_news_dict) 166 | unit.event_tag 
= 0 # 所有内容更新完成之后将事件表示为0 167 | else: 168 | continue 169 | 170 | # step 5、将更新后的事件单元保存下来 171 | event_save_name = int(time.time()) 172 | event_save_path = conf.event_save_path 173 | # event_save_path = "/Users/li/PycharmProjects/event_parser/src/model/event_model/" 174 | event_util.event_save(new_event_units, event_save_name, event_save_path) 175 | 176 | # step 6、load最新的事件单元库 177 | file_new = file_util.find_newest_file(event_save_path) 178 | logging.logger.info('[最新的文件: %s]' % file_new) 179 | # new_event_units = event_util.load_history_event(file_new) 180 | # for i in new_event_units: 181 | # print i.topic_title 182 | # print i.event_id 183 | # print i.node_list 184 | # print i.stocks 185 | -------------------------------------------------------------------------------- /src/event2mysql.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: event2mysql.py 8 | @time: 2018-12-17 13:53 9 | 将更新好的事件按固定格式保存到mysql中 10 | """ 11 | 12 | import json 13 | import pandas as pd 14 | from src import data_reader 15 | from src.configure import conf 16 | from src.utils import file_util, event_util 17 | from src.utils.log import log_util 18 | from src.utils.engine import data_source 19 | 20 | logging = log_util.Logger('event2mysql') 21 | event_save_path = conf.event_save_path 22 | # event_save_path = "/Users/li/PycharmProjects/event_parser/src/model/event_model/" 23 | 24 | # 从文件目录中导入最新的更新文件 25 | file_new = file_util.find_newest_file(event_save_path) 26 | new_event_units = event_util.load_history_event(event_save_path + file_new) 27 | 28 | # 从数据库中读取最新的新闻的id,title,url和timestamp 29 | total_data = data_reader.get_all_data() 30 | # 将id设为index,方面后面根据id提取title和url 31 | total_data_df = total_data.set_index('id') 32 | 33 | # 将事件单元的信息整理成规定格式 34 | result = [] 35 | for item in new_event_units: 36 | # 如果是有效事件 37 | if item.effectiveness == 1: 38 | logging.logger.info('[effective event ID]: %s' % item.event_id) 39 | logging.logger.info('[effective event title]: %s' % item.topic_title) 40 | event_stock = ','.join(k for k in set(item.stocks)) 41 | logging.logger.info('[effective stock]: %s\n' % event_stock) 42 | logging.logger.info('[effective node list]: %s' % item.node_list) 43 | # 从dataFrame中获取事件单元中node的标题和出处 44 | title_url = [] 45 | time_list = [] 46 | for node_id in item.node_list: 47 | title, url, unix_time = total_data_df.loc[node_id][['title', 'url', 'unix_time']] 48 | time_list.append(unix_time) 49 | title_url.append({node_id: {'news_title': title.encode('utf-8'), 'url': url, 'unix_time': unix_time}}) 50 | event_detail = json.dumps(title_url) 51 | stop_time = max(time_list) 52 | start_time = min(time_list) 53 | logging.logger.info("[event start-stop time]start_time {}, stop_time {}".format(start_time, stop_time)) 54 | result.append((item.event_id, item.topic_title.encode('utf-8'), event_stock, start_time, stop_time, event_detail)) 55 | else: 56 | continue 57 | 58 | # 整理成dataFrame的格式 59 | result_df = pd.DataFrame(result, 60 | columns=['event_id', 'event_title', 'event_stock', 'start_time', 'stop_time', 'event_detail']) 61 | 62 | """ 63 | # 将整理好「事件以及事件涉及股票列表」数据保存到{event_detail} 64 | """ 65 | # # 创建数据库引擎 66 | engine_mysql = data_source.GetDataEngine("XAVIER_DB") 67 | # # 将「事件以及事件涉及股票列表」的数据保存到mysql中 68 | result_df.to_sql('event_detail', engine_mysql, if_exists='replace', index=False) 69 | logging.logger.info('event_detail update success') 70 | 71 | """ 72 | # 
整理出「股票以及股票涉及的事件列表」数据保存到{symbol_event_detail} 73 | """ 74 | event_symbol = result_df[['event_id', 'event_stock']] 75 | # print event_symbol 76 | lst = {} 77 | for i in range(len(event_symbol)): 78 | event_id = event_symbol.loc[i]['event_id'] 79 | event_stock = event_symbol.loc[i]['event_stock'].strip() 80 | # if event_stock != '': # 剔除没有股票的事件 81 | # for symbol in event_stock.split(','): 82 | # lst.setdefault(symbol, []).append("'" + str(event_id) + "'") 83 | for symbol in event_stock.split(','): 84 | lst.setdefault(symbol, []).append("'" + str(event_id) + "'") 85 | 86 | tmp_result = pd.DataFrame(list(lst.items()), columns=['SYMBOL', 'event_id']) 87 | tmp_result['event_id'] = tmp_result['event_id'].apply(lambda x: ','.join(x)) 88 | # print tmp_result 89 | # 将「股票以及股票涉及的事件列表」数据保存到mysql中 90 | tmp_result.to_sql('symbol_event_detail', engine_mysql, if_exists='replace', index=False) 91 | logging.logger.info('symbol_event_detail update success') 92 | -------------------------------------------------------------------------------- /src/event_update.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo $(date) 4 | 5 | source ../venv/bin/activate 6 | 7 | U_V1=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $1}'` 8 | U_V2=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $2}'` 9 | U_V3=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $3}'` 10 | 11 | if [[ ${U_V1}.${U_V2}.${U_V3} == '2.7.14' ]];then 12 | echo 'dynamic_update_event.py start' 13 | python dynamic_update_event.py 14 | 15 | sleep 5s 16 | 17 | echo 'Save Event to MySQL' 18 | python event2mysql.py 19 | echo 'Event Update Finished' 20 | else 21 | echo 'Virtualenv Start Sailed,Event Update failed' 22 | fi 23 | -------------------------------------------------------------------------------- /src/history_event.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: history_event.py 8 | @time: 2018/11/14 3:33 PM 9 | 将类簇转换成事件单元,并根据类簇中的节点id从文本中提取每个类簇对应的新闻,构成事件单元,然后提取每个事件单元涉及的股票。并且对每个事件单元提取关键词代表每个事件单元。所有的结果打包成pickle文件保存到本地。 10 | """ 11 | import gc 12 | import pickle 13 | import pandas as pd 14 | 15 | import sys 16 | sys.path.append('..') 17 | sys.path.append('../') 18 | sys.path.append('../../') 19 | 20 | from src.configure import conf 21 | from src.utils import event_util 22 | from src.utils.log import log_util 23 | from src.utils.VSM import tfidf 24 | from src.data_reader import import_news, import_title, get_event_news 25 | 26 | # import logger 27 | logging = log_util.Logger('history_event') 28 | # 导入通过singlePass聚类生成的类簇 29 | # clustering_path = '/Users/li/PycharmProjects/event_parser/src/model/clustering_new.pkl' 30 | clustering_path = conf.clustering_save_path 31 | try: 32 | with open(clustering_path, 'rb') as fr: 33 | clustering = pickle.load(fr) 34 | logging.logger.info('load cluster units from: {}'.format(clustering_path)) 35 | except IOError as err: 36 | logging.logger.error('cluster units pickle file load failed: {} and program stopped'.format(clustering_path)) 37 | sys.exit() 38 | # clustering.print_result() 39 | 40 | # 读取新闻文本 41 | # 新闻保存的路径 42 | # corpus_news = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt" 43 | corpus_news = conf.corpus_news 44 | # 新闻标题保存的路径 45 | # corpus_news_title = "/Users/li/PycharmProjects/event_parser/src/data/text_title_index.txt" 46 | corpus_news_title = conf.corpus_news_title 47 | logging.logger.info('load corpus_news_title from: {}'.format(corpus_news_title)) 48 | # 构建新闻正文词典 49 | news_dict = import_news(corpus_news) 50 | # 构建新闻标题词典 51 | news_title_dict = import_title(corpus_news_title) 52 | 53 | # load tf-idf VSM 54 | tfidf_feature_path = conf.tfidf_feature_path 55 | tfidf_transformer_path = conf.tfidftransformer_path 56 | try: 57 | tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path) 58 | logging.logger.info("TF-IDF feature load success") 59 | tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path) 60 | logging.logger.info("TF-IDF transformer load success") 61 | except: 62 | logging.logger.info("TF-IDF model load failed, please check path %s,%s" % (tfidf_feature_path, 63 | tfidf_transformer_path)) 64 | sys.exit() 65 | # 股票及股票代码 66 | stock_df = pd.read_csv(conf.stock_new_path, encoding='utf-8').set_index('SESNAME') 67 | # 事件有效性判断 68 | # effectiveness_events, non_effectiveness_events = event_util.events_effectiveness(clustering.cluster_list, news_dict) 69 | 70 | ''' 71 | 构建事件单元 72 | ''' 73 | event_unit_lists = [] 74 | # for cluster_index, cluster in enumerate(effectiveness_events): 75 | for cluster_index, cluster in enumerate(clustering.cluster_list): 76 | logging.logger.info('[event_id]: {}'.format(cluster_index)) # 簇的序号 77 | logging.logger.info('[event_node_id]: {}'.format(cluster.node_list)) # 该簇的节点列表 78 | 79 | event_unit = event_util.EventUnit() 80 | event_unit.node_list = cluster.node_list 81 | event_unit.node_num = cluster.node_num 82 | event_unit.centroid = cluster.centroid 83 | event_unit.event_id = cluster_index 84 | 85 | # 获取事件单元中的标题 86 | event_title_lists = get_event_news(news_title_dict, cluster.node_list) 87 | # 获取事件单元中的新闻正文 88 | event_news_lists = get_event_news(news_dict, cluster.node_list) 89 | # # 事件表示,提取事件中涉及的股票,对所有新闻提取关键词, 添加事件标题 90 | # stock_list, keywords_list = event_util.event_expression(event_title_lists, event_news_lists) 91 | # # 事件表示, 计算事件的标题 92 | # topic_title = event_util.units_title(cluster, news_dict, 
news_title_dict) 93 | # print "[事件标题]:\n %s " % topic_title 94 | # event_unit.topic_title, event_unit.stocks, event_unit.keywords = topic_title, stock_list, keywords_list 95 | 96 | # 添加涉及的股票和事件关键词 97 | event_unit.event_expression(event_title_lists, event_news_lists, stock_df) 98 | # 添加事件标题 99 | event_unit.add_unit_title(news_dict, news_title_dict, tfidf_feature, tfidf_transformer) 100 | event_unit_lists.append(event_unit) 101 | del event_unit 102 | gc.collect() 103 | logging.logger.info('[聚类类簇的个数]: {}'.format(len(clustering.cluster_list))) 104 | logging.logger.info('[事件库中事件的个数]: {}'.format(len(event_unit_lists))) 105 | 106 | # event_lib = EventLib() 107 | # event_lib.event_unit_list = event_unit_lists 108 | 109 | # 保存事件库 110 | # event_unit_path = '/Users/li/PycharmProjects/event_parser/src/model/event_units_new.pkl' 111 | # event_unit_path = conf.event_unit_path 112 | event_save_name = conf.data_time 113 | event_unit_path = conf.event_save_path + event_save_name + '.pkl' 114 | 115 | with open(event_unit_path, 'wb') as fw: 116 | # pickle.dump(event_lib, fw) 117 | pickle.dump(event_unit_lists, fw) 118 | logging.logger.info('[历史事件运行结束]事件库保存目录为:{}'.format(event_unit_path)) 119 | -------------------------------------------------------------------------------- /src/load_event_data.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: load_event_data.py 8 | @time: 2018-12-25 18:18 9 | """ 10 | from src.configure import conf 11 | from src.utils import file_util, event_util 12 | 13 | 14 | event_save_path = conf.event_save_path 15 | # event_save_path = "/Users/li/PycharmProjects/event_parser/src/model/event_model/" 16 | 17 | # 从文件目录中导入最新的更新文件 18 | file_new = file_util.find_newest_file(event_save_path) 19 | new_event_units = event_util.load_history_event(file_new) 20 | 21 | for i in new_event_units: 22 | print("topic_title %s" % i.topic_title) 23 | print("event_id %s" % i.event_id) 24 | print("node_list %s" % i.node_list) 25 | print("stock_list %s\n" % i.stocks) 26 | -------------------------------------------------------------------------------- /src/model/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/11/5 3:43 PM 9 | """ -------------------------------------------------------------------------------- /src/parser/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019-03-05 10:33 9 | """ 10 | import argparse 11 | import datetime 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser() 14 | 15 | date = str(datetime.date.today().strftime("%Y-%m-%d")) 16 | parser.add_argument('--start_date', type=str, default=date) 17 | parser.add_argument('--end_date', type=int, default=0) 18 | parser.add_argument('--count', type=int, default=10) 19 | parser.add_argument('--rebuild', type=bool, default=False) 20 | parser.add_argument('--schedule', type=bool, default=False) 21 | 22 | args = parser.parse_args() 23 | print(args.start_date) -------------------------------------------------------------------------------- /src/parser/news_parser/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/10/30 10:44 AM 9 | """ 10 | import sys 11 | sys.path.append('../') 12 | sys.path.append('..') 13 | sys.path.append('../../') 14 | -------------------------------------------------------------------------------- /src/parser/news_parser/dbscan.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: dbscan.py 8 | @time: 2018/10/31 10:51 AM 9 | """ 10 | 11 | from sklearn import cluster 12 | from sklearn.metrics import adjusted_rand_score 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | from sklearn.datasets.samples_generator import make_blobs 16 | from sklearn import mixture 17 | from sklearn.svm.libsvm import predict 18 | 19 | 20 | def create_data(centers, num=100, std=0.7): 21 | # 产生数据 22 | X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std) 23 | return X, labels_true 24 | 25 | 26 | def plot_data(*data): 27 | """ 28 | 数据作图 29 | """ 30 | X,labels_true = data 31 | labels=np.unique(labels_true) 32 | fig=plt.figure() 33 | ax=fig.add_subplot(1,1,1) 34 | colors='rgbycm' 35 | for i,label in enumerate(labels): 36 | position=labels_true==label 37 | ax.scatter(X[position,0],X[position,1],label="cluster %d"%label), 38 | color=colors[i%len(colors)] 39 | 40 | ax.legend(loc="best",framealpha=0.5) 41 | ax.set_xlabel("X[0]") 42 | ax.set_ylabel("Y[1]") 43 | ax.set_title("data") 44 | plt.show() 45 | 46 | 47 | # 测试函数 48 | def test_DBSCAN(*data): 49 | X,labels_true = data 50 | clst = cluster.DBSCAN() 51 | predict_labels = clst.fit_predict(X) 52 | print("ARI:%s"%adjusted_rand_score(labels_true, predict_labels)) 53 | print("Core sample num:%d"%len(clst.core_sample_indices_)) 54 | 55 | 56 | def test_DBSCAN_epsilon(*data): 57 | X,labels_true = data 58 | epsilons = np.logspace(-1,1.5) 59 | ARIs=[] 60 | Core_nums = [] 61 | for epsilon in epsilons: 62 | clst = cluster.DBSCAN(eps=epsilon) 63 | predicted_labels = clst.fit_predict(X) 64 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels)) 65 | Core_nums.append(len(clst.core_sample_indices_)) 66 | 67 | fig = plt.figure(figsize=(10,5)) 68 | ax = fig.add_subplot(1,2,1) 69 | ax.plot(epsilons,ARIs,marker = '+') 70 | ax.set_xscale('log') 71 | ax.set_xlabel(r"$\epsilon$") 72 | ax.set_ylim(0,1) 73 | ax.set_ylabel('ARI') 74 | 75 | ax = fig.add_subplot(1,2,2) 76 | ax.plot(epsilons,Core_nums,marker='o') 77 | ax.set_xscale('log') 78 | ax.set_xlabel(r"$\epsilon$") 79 | ax.set_ylabel('Core_num') 80 | 81 | fig.suptitle("DBSCAN") 82 | 
plt.show() 83 | 84 | 85 | def test_DBSCAN_min_samples(*data): 86 | X,labels_true=data 87 | min_samples = range(1,100) 88 | ARIs = [] 89 | Core_nums = [] 90 | for num in min_samples: 91 | clst = cluster.DBSCAN(min_samples=num) 92 | predicted_labels = clst.fit_predict(X) 93 | ARIs.append(adjusted_rand_score(labels_true, predicted_labels)) 94 | Core_nums.append(len(clst.core_sample_indices_)) 95 | 96 | fig=plt.figure(figsize=(10,5)) 97 | ax=fig.add_subplot(1,2,1) 98 | ax.plot(min_samples,ARIs,marker='+') 99 | ax.set_xlabel("min_samples") 100 | ax.set_ylim(0,1) 101 | ax.set_ylabel('ARI') 102 | 103 | ax=fig.add_subplot(1,2,2) 104 | ax.plot(min_samples,Core_nums,marker='o') 105 | ax.set_xlabel("min_samples") 106 | ax.set_ylabel('Core_nums') 107 | 108 | fig.suptitle("DBSCAN") 109 | plt.show() 110 | 111 | 112 | if __name__ == '__main__': 113 | X, labels_true = create_data(4) 114 | # plot_data(data) 115 | test_DBSCAN(X,labels_true) 116 | 117 | centers = [[1,1],[1,2],[2,2],[10,20]] 118 | X,labels_true = create_data(centers,1000,0.5) 119 | test_DBSCAN_epsilon(X, labels_true) 120 | 121 | centers = [[1,1],[1,2],[2,2],[10,20]] 122 | X,labels_true = create_data(centers,1000,0.5) 123 | test_DBSCAN_min_samples(X,labels_true) -------------------------------------------------------------------------------- /src/parser/news_parser/tonghuashun.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: tonghuashun.py 8 | @time: 2018/10/30 11:13 AM 9 | """ 10 | 11 | import pyhanlp 12 | import jpype 13 | from jpype import * 14 | # jvmPath = jpype.getDefaultJVMPath() 15 | # print(jvmPath) 16 | # # jpype.startJVM(jvmPath) 17 | # jpype.java.lang.System.out.println("hello world!") 18 | # java.lang.System.out.println("hello world") 19 | # # jpype.shutdownJVM() 20 | 21 | 22 | HanLP = JClass('com.hankcs.hanlp.HanLP') 23 | #中文分词 24 | print(HanLP.segment("你好,欢迎在Python中调用HanLP的API").toString()) 25 | testCases = [ 26 | "商品和服务", 27 | "结婚的和尚未结婚的确实在干扰分词啊", 28 | "买水果然后来世博园最后去世博会", 29 | "中国的首都是北京", 30 | "欢迎新老师生前来就餐", 31 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 32 | "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"] 33 | for sentence in testCases: print(HanLP.segment(sentence)) 34 | # 命名实体识别与词性标注 35 | NLPTokenizer = JClass('com.hankcs.hanlp.tokenizer.NLPTokenizer') 36 | print(NLPTokenizer.segment('中国科学院计算技术研究所的宗成庆教授正在教授自然语言处理课程')) 37 | # 关键词提取 38 | document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \ 39 | "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \ 40 | "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \ 41 | "严格地进行水资源论证和取水许可的批准。" 42 | print(HanLP.extractKeyword(document, 2)) 43 | # 自动摘要 44 | print(HanLP.extractSummary(document, 2)) 45 | # 依存句法分析 46 | print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。")) 47 | jpype.shutdownJVM() 48 | 49 | -------------------------------------------------------------------------------- /src/parser/news_parser/xueqiu.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu.py 8 | @time: 2018/10/30 2:31 PM 9 | """ 10 | import sys 11 | sys.path.append("../") 12 | 13 | from pyhanlp import * 14 | from src.data_reader import read_full_data 15 | 16 | news = read_full_data() 17 | # print news['title'] 18 | # print news['content'] 19 | 20 | for index, item in news.iterrows(): 21 | title, content = item['title'], item['content'] 22 | title_list = HanLP.extractKeyword(title, 8) 23 | content_list = HanLP.extractSummary(content, 2) 24 | print(title, title_list) 25 | print(content, content_list) 26 | 27 | 28 | 29 | # 新闻标题聚合 30 | 31 | # 个股事件 32 | # 新闻事件 33 | 34 | 35 | # 关键词提取 36 | # document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \ 37 | # "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \ 38 | # "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \ 39 | # "严格地进行水资源论证和取水许可的批准。" 40 | # print(HanLP.extractKeyword(document, 8)) 41 | 42 | 43 | # 自动摘要 44 | # print(HanLP.extractSummary(document, 3)) 45 | 46 | 47 | # 依存句法分析 48 | # print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。")) -------------------------------------------------------------------------------- /src/parser/requirement.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | attrs==19.1.0 3 | backcall==0.1.0 4 | backports.functools-lru-cache==1.5 5 | bleach==3.1.0 6 | boto==2.49.0 7 | boto3==1.9.34 8 | botocore==1.12.34 9 | bz2file==0.98 10 | certifi==2018.10.15 11 | chardet==3.0.4 12 | cycler==0.10.0 13 | Cython==0.29.6 14 | decorator==4.4.0 15 | defusedxml==0.6.0 16 | docutils==0.14 17 | entrypoints==0.3 18 | gensim==3.6.0 19 | hanlp==5.0.0 20 | idna==2.7 21 | ipykernel==5.1.0 22 | ipython==7.5.0 23 | ipython-genutils==0.2.0 24 | ipywidgets==7.4.2 25 | jedi==0.13.3 26 | jieba==0.39 27 | Jinja2==2.10.1 28 | jmespath==0.9.3 29 | joblib==0.13.2 30 | JPype1==0.6.3 31 | jsonschema==3.0.1 32 | jupyter==1.0.0 33 | jupyter-client==5.2.4 34 | jupyter-console==6.0.0 35 | jupyter-core==4.4.0 36 | kiwisolver==1.0.1 37 | MarkupSafe==1.1.1 38 | matplotlib==2.2.3 39 | mistune==0.8.4 40 | mysql-connector==2.1.6 41 | nbconvert==5.5.0 42 | nbformat==4.4.0 43 | notebook==5.7.8 44 | numpy==1.16.2 45 | pandas==0.23.4 46 | pandocfilters==1.4.2 47 | parso==0.4.0 48 | pexpect==4.7.0 49 | pickleshare==0.7.5 50 | prometheus-client==0.6.0 51 | prompt-toolkit==2.0.9 52 | ptyprocess==0.6.0 53 | Pygments==2.3.1 54 | pyhanlp==0.1.44 55 | pymssql==2.1.4 56 | pyparsing==2.2.2 57 | pyrsistent==0.15.1 58 | python-dateutil==2.7.5 59 | pytz==2018.7 60 | pyzmq==18.0.1 61 | qtconsole==4.4.3 62 | requests==2.20.0 63 | s3transfer==0.1.13 64 | scikit-learn==0.20.0 65 | scipy==1.1.0 66 | Send2Trash==1.5.0 67 | six==1.11.0 68 | sklearn==0.0 69 | smart-open==1.7.1 70 | SQLAlchemy==1.2.12 71 | subprocess32==3.5.3 72 | terminado==0.8.2 73 | testpath==0.4.2 74 | tornado==6.0.2 75 | tqdm==4.31.1 76 | traitlets==4.3.2 77 | urllib3==1.24 78 | wcwidth==0.1.7 79 | webencodings==0.5.1 80 | widgetsnbextension==3.4.2 81 | -------------------------------------------------------------------------------- /src/parser/xueqiu/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 代码维护 3 | ## discuss_parser/xueqiu_discuss_daily.py 4 | ### 描述 5 | 实时统计每天新增讨论的中涉及股票, 并且转换成股票以及股票的讨论数。 6 | ### 运行方式 7 | 每天定时运行 8 | 9 | ### 保存格式 10 | ["stock", "xid_list", "xid_count", "created_at"] 11 | 12 | 13 | ## focus_parser/xueqiu_focus_statistics.py 14 | ### 描述 15 | 每天定时统计大V关注的股票, 增量式计算每只股票的大V关注数。然后跟当天的时间一起入库 16 | 17 | ### 运行方式 18 | 每天定时运行 19 | 20 | ### 保存格式 21 | 
["symbol", "focus_total_count", "created_at"] 22 | 23 | 24 | # 定时任务维护 25 | ## discuss_focus_statistic_daily.sh 26 | 使用crontab每天定时运行该脚本文件,运行之前注意配置python的虚拟环境。 27 | 28 | 29 | 30 | 31 | # 存在的问题 32 | 将stock.csv中的股票代码需要重新转换, 将引号去掉,比如['300315'] 33 | 34 | -------------------------------------------------------------------------------- /src/parser/xueqiu/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019-04-29 14:46 9 | """ -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_focus_statistics_daily.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo $(date) 4 | 5 | #source ../venv/bin/activate 6 | 7 | U_V1=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $1}'` 8 | U_V2=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $2}'` 9 | U_V3=`python -V 2>&1|awk '{print $2}'|awk -F '.' '{print $3}'` 10 | 11 | if [[ ${U_V1}.${U_V2}.${U_V3} == '3.7.2' ]];then 12 | echo 'xueqiu_discuss_daily.py start' 13 | python ./discuss_parser/xueqiu_discuss_daily.py 14 | 15 | sleep 5s 16 | 17 | echo 'xueqiu_focus_statistics.py start' 18 | python ./focus_parser/xueqiu_focus_statistics.py 19 | echo 'Finished' 20 | else 21 | echo 'Virtualenv Start Failed,Event Update failed' 22 | fi 23 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/EventsParser/2e3b8100e1e9d7140a6215d07070a90381b3007f/src/parser/xueqiu/discuss_parser/__init__.py -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/discuss_data/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019-04-09 15:19 9 | """ -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/discuss_data/discuss.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/EventsParser/2e3b8100e1e9d7140a6215d07070a90381b3007f/src/parser/xueqiu/discuss_parser/discuss_data/discuss.db -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/discuss_data/discuss.db': -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/EventsParser/2e3b8100e1e9d7140a6215d07070a90381b3007f/src/parser/xueqiu/discuss_parser/discuss_data/discuss.db' -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/discuss_parser.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: 1.0 6 | @author: LiYu 7 | @file: discuss_parser.py 8 | @time: 2019-03-26 22:51 9 | # 识别评论中的股票实体。 10 | # 对讨论进行分词,然后提取评论中的股票实体 11 | # 代码中使用了多进程来处理DataFrame数据。 12 | """ 13 | import os 14 | import glob 15 | import time 16 | import pandas as pd 17 | import multiprocessing 18 | from joblib import Parallel, delayed 19 | from src.utils import dicts 20 | from src.utils.data_process import DataPressing 21 | from src.utils.tokenization import Tokenizer, load_stop_words 22 | 23 | 24 | class DiscussParser(object): 25 | """ 26 | 讨论解析器 27 | """ 28 | def __init__(self): 29 | # 加载分词自定义词典 30 | dicts.init() 31 | self.data_process = DataPressing() 32 | # 停用词 33 | self.stop_words = load_stop_words() 34 | # 股票-股票代码对, 并且对股票代码做一些变换,比如 35 | _, self.stocks_df = dicts.load_stock_data() 36 | self.tokenizer = Tokenizer(self.data_process, self.stop_words) 37 | 38 | def __cut_process(self, text): 39 | """ 40 | 数据处理模块, 分词、提取股票实体词 41 | :param text: 42 | :return: 43 | """ 44 | print('cut_process进程: %sd 父进程ID:%s' % (os.getpid(), os.getppid())) 45 | # 分词 46 | # 用到多进程处理DataFrame,所以将类申明放到每个进程中,不然在调用token的时候,每个子进程不能再调用初始化词典 47 | text_list = self.tokenizer.token(text) 48 | # print("text_list %s" % text_list) 49 | # 提取text中涉及到的股票实体,并且转换成股票代码 50 | stock_list = self.data_process.find_stocks(text_list, self.stocks_df) 51 | # stock_list = ','.join(stock_list) # 展示使用 52 | return stock_list 53 | 54 | def tmp_func(self, tmp_df, column="text"): 55 | """ 56 | apply函数封装 57 | :param column: 需要处理的列名 58 | :param tmp_df: 59 | :return: 60 | """ 61 | print('tmp_func进程: %sd 父进程ID:%s' % (os.getpid(), os.getppid())) 62 | tmp_df['stock_list'] = tmp_df[column].apply(self.__cut_process) 63 | return tmp_df 64 | 65 | @staticmethod 66 | def __apply_parallel(df_grouped, func): 67 | """ 68 | # 多进程处理dataframe 69 | :param df_grouped: 70 | :param func: 71 | :return: 72 | """ 73 | print('apply_parallel是进程: %sd 父进程ID:%s' % (os.getpid(), os.getppid())) 74 | num_cpu = multiprocessing.cpu_count() 75 | 76 | # Parallel不使用参数的时候, 程序多进程运行, 但是字典没有加载 77 | # res_list = Parallel(n_jobs=num_cpu - 2)(delayed(func)(group) for name, group in df_grouped) 78 | # 单独使用prefer参数, 依然是单进程 79 | # res_list = Parallel(n_jobs=num_cpu - 2, prefer="threads")(delayed(func)(group) for name, group in df_grouped) 80 | # 单独使用backend, 词典可以加载成功 81 | # res_list = Parallel(n_jobs=num_cpu - 2, backend="multiprocessing")(delayed(func)(group) for name, group in df_grouped) 82 | # 两个参数都设置, 词典加载成功, 而且运行时间略有缩短 83 | res_list = Parallel(n_jobs=(num_cpu - 2), backend="multiprocessing", prefer="threads")(delayed(func)(group) for name, group in df_grouped) 84 | return pd.concat(res_list) 85 | 86 | def run(self, target_df): 87 | """ 88 | 多进程处理主程序 89 | :param target_df: 90 | :return: 91 | """ 92 | # print('run进程: %sd 父进程ID:%s' % (os.getpid(), os.getppid())) 93 | # 将输入数据按照 94 | df_grouped = target_df.groupby(target_df.index) 95 | res_df = self.__apply_parallel(df_grouped, self.tmp_func) 96 | return res_df 97 | 98 | 99 | # 测试用接口 100 | def read_csv(): 101 | path = '/Users/li/Desktop/sets1' 102 | file_list = glob.glob(os.path.join(path, "*.csv")) 103 | data_list = [] 104 | for f in file_list: 105 | data_list.append(pd.read_csv(f, 106 | header=0, 107 | names=[u'create_at', u'desc', u'id', u'rcreate_at', u'rdesc', 108 | u'rid', u'rtitle', u'ruid', u'screen_sname', u'uid'], encoding='utf-8')) 109 | # data_list.append(pd.read_csv(f)) 110 | 111 | df_result = pd.concat(data_list, sort=True) 112 | print(len(df_result)) 113 | return df_result 114 | 
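# --- Minimal sketch of the row-parallel pattern used by DiscussParser.run above:
# group the DataFrame by its index so every group is a single row, fan the groups
# out with joblib.Parallel, and concatenate the partial results. `toy_extract` and
# `toy_func` are hypothetical stand-ins for the real tokenizer + stock lookup and
# exist only for illustration; the class above wires in Tokenizer/DataPressing instead.
import multiprocessing

import pandas as pd
from joblib import Parallel, delayed


def toy_extract(text):
    # Hypothetical entity extractor: keep sentence fragments that mention "中信".
    return [part for part in text.split('.') if '中信' in part]


def toy_func(group_df):
    # Same shape as tmp_func above: add a stock_list column to a one-row group.
    group_df = group_df.copy()
    group_df['stock_list'] = group_df['text'].apply(toy_extract)
    return group_df


if __name__ == '__main__':
    demo_df = pd.DataFrame({'text': ['中信证券不错.大盘一般', '今天没有行情.空仓'] * 4})
    grouped = demo_df.groupby(demo_df.index)          # one group per row index
    n_jobs = max(1, multiprocessing.cpu_count() - 2)  # mirrors num_cpu - 2 above
    parts = Parallel(n_jobs=n_jobs, backend="multiprocessing")(
        delayed(toy_func)(group) for _, group in grouped)
    print(pd.concat(parts).sort_index())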
115 | 116 | def create_dic(target_df, xid, unix_time): 117 | tmp_dic = dict() 118 | tmp_dic['xid'] = target_df[xid] 119 | tmp_dic['unix_time'] = target_df[unix_time] 120 | return [target_df[xid], target_df[unix_time]] 121 | 122 | 123 | if __name__ == '__main__': 124 | 125 | print('main进程: %sd 父进程ID:%s' % (os.getpid(), os.getppid())) 126 | discuss_parser = DiscussParser() 127 | 128 | # test_df = pd.DataFrame() 129 | # target_df = read_csv() 130 | # # 测试用 131 | # target_df = target_df.head(5) 132 | # # 可以优化 133 | # funccc = lambda x: str(x) # 类型转换 134 | # test_df['xid'] = target_df['id'].apply(funccc) 135 | # test_df['uid'] = target_df['uid'].apply(funccc) 136 | # # 转换时间格式 137 | # test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 138 | # # 将id和unix_time构建成一个整体 139 | # test_df['information_dic'] = test_df[['xid', 'unix_time']].apply(create_dic, axis=1, args=('xid', 'unix_time')) 140 | # 141 | # # desc和rdesc两个讨论合并在一起处理 142 | # funcc = lambda x: str(x[0]) + '.' + str(x[1]) 143 | # test_df['text'] = target_df[['desc', 'rdesc']].apply(funcc, axis=1) 144 | # 145 | # res = discuss_parser.run(test_df) 146 | df = pd.DataFrame({'text': ['大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,中信证券不错'] * 1000}) 147 | start_time = time.time() 148 | 149 | res = discuss_parser.run(df) 150 | print('spend time %s' % (time.time() - start_time)) 151 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/format_transform.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: format_transform.py 8 | @time: 2019-04-09 10:41 9 | 数据转换,统计讨论数目 10 | """ 11 | import pandas as pd 12 | from src.utils.engine import data_source 13 | 14 | 15 | def county(x): 16 | """ 17 | 统计list中元素的个数 18 | :param x: 19 | :return: 20 | """ 21 | tmp = x.split(',') 22 | return len(tmp) 23 | 24 | 25 | def symbol_format(x): 26 | """ 27 | 转换股票代码的格式 28 | :param x: 29 | :return: 30 | """ 31 | symbol = x.split('\'')[1] 32 | head = int(symbol[:1]) 33 | if head == 6 or head == 9: 34 | return symbol + '.' + 'XSHG' 35 | elif head == 0 or head == 3 or head == 2: 36 | return str(symbol) + '.' + 'XSHE' 37 | elif head == 8 or head == 4: 38 | return str(symbol) + '.' + 'OC' 39 | else: 40 | return str(symbol) 41 | 42 | 43 | if __name__ == '__main__': 44 | 45 | engine_sqlite = data_source.GetDataEngine("XAVIER_SQLITE") 46 | 47 | sql = "SELECT * FROM history_stock_discuss_filter" 48 | # sql = "SELECT * FROM history_discuss_stock_filter" 49 | df = pd.read_sql(sql, engine_sqlite) 50 | 51 | df['symbol'] = df['stock'].apply(symbol_format) 52 | 53 | df['xid_count'] = df['xid_list'].apply(county) 54 | 55 | df_grouped = df.groupby('created_date') 56 | 57 | engine_mysql = data_source.GetDataEngine("VISION") 58 | 59 | for i, j in df_grouped: 60 | print(len(j)) 61 | # 这边可以统计每天的总讨论数量,由此作为大盘的特征因子 62 | # 数据按天分批插入数据库,数据如果需要重跑,则需要删掉原始的表 63 | j.to_sql('history_discuss_stock_filter', engine_mysql, if_exists='append', index=False) 64 | 65 | 66 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/participle/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/10/30 10:45 AM 9 | """ 10 | 11 | from pyhanlp import * 12 | import jieba 13 | 14 | 15 | print(HanLP.segment('你好,欢迎在Python中调用HanLP的API')) 16 | 17 | for term in HanLP.segment('下雨天地面积水'): 18 | print('{}\t{}'.format(term.word, term.nature)) # 获取单词与词性 19 | 20 | # jieba和hanlp分词结果对比 21 | testCases = [ 22 | "中美贸易战开打了,大家小心钱包", 23 | '该来的没来,不该来的来了一大堆', 24 | "商品和服务", 25 | "结婚的和尚未结婚的确实在干扰分词啊", 26 | "买水果然后来世博园最后去世博会", 27 | "中国的首都是北京", 28 | "欢迎新老师生前来就餐", 29 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 30 | "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"] 31 | for sentence in testCases: 32 | print('\t'.join(jieba.cut(sentence))) 33 | 34 | testCases = [ 35 | "中美贸易战开打了,大家小心钱包", 36 | '该来的没来,不该来的来了一大堆', 37 | "商品和服务", 38 | "结婚的和尚未结婚的确实在干扰分词啊", 39 | "买水果然后来世博园最后去世博会", 40 | "中国的首都是北京", 41 | "欢迎新老师生前来就餐", 42 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 43 | "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"] 44 | for sentence in testCases: print(HanLP.segment(sentence)) 45 | 46 | # 关键词提取 47 | document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \ 48 | "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \ 49 | "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \ 50 | "严格地进行水资源论证和取水许可的批准。" 51 | print(HanLP.extractKeyword(document, 2)) 52 | 53 | # 自动摘要 54 | print(HanLP.extractSummary(document, 3)) 55 | 56 | # 依存句法分析 57 | print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。")) -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_dicsuss_batch.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: xueqiu_dicsuss_batch.py 8 | @time: 2019-03-29 14:05 9 | """ 10 | import gc 11 | import os 12 | import glob 13 | import pandas as pd 14 | from datetime import datetime, timedelta 15 | from src.utils import time_util, dicts 16 | from src.utils.log import log_util 17 | 18 | from src.utils.data_process import DataPressing 19 | from src.utils.tokenization import Tokenizer, load_stop_words 20 | 21 | logging = log_util.Logger('xueqiu_discuss_batch') 22 | 23 | 24 | # 使用desc和rdesc作为当前用户的讨论数据,用户id为当前用户id,讨论id为 25 | def read_csv(path=None): 26 | """ 27 | 读取原始csv文件 28 | :param path: 29 | :return: 30 | """ 31 | if path is None: 32 | path = '/Users/li/Desktop/sets1' 33 | file_list = glob.glob(os.path.join(path, "*.csv")) 34 | data_list = [] 35 | for f in file_list: 36 | data_list.append(pd.read_csv(f, header=0, 37 | names=[u'create_at', u'desc', u'id', u'rcreate_at', u'rdesc', u'rid', u'rtitle', 38 | u'ruid', u'screen_sname', u'uid'], dtype={u'id': str, u'uid': str}, 39 | encoding='utf-8')) 40 | # data_list.append(pd.read_csv(f)) 41 | 42 | df_result = pd.concat(data_list, sort=True) 43 | 44 | return df_result 45 | 46 | 47 | def create_dic(df, xid, unix_time): 48 | """ 49 | 将xid和unix_time构建成一个list 50 | :param df: 51 | :param xid: 52 | :param unix_time: 53 | :return: 54 | """ 55 | tmp = dict() 56 | tmp['xid'] = df[xid] 57 | tmp['unix_time'] = df[unix_time] 58 | return [df[xid], df[unix_time]] 59 | 60 | 61 | # 数据结构调整 62 | def transform_fuc(id, stock_list): 63 | """ 64 | 将user_id和stock_list两两组合成tuple的list集合 65 | :param id: str 66 | :param stock_list: list 67 | :return: 68 | """ 69 | if len(stock_list) <= 0: 70 | pass 71 | user_id_list = [id] * len(stock_list) 72 | tuple_zip = zip(stock_list, user_id_list) 73 | tuple_list = list(tuple_zip) 74 | return tuple_list 75 | 76 | 77 | def cut_process(text, 
data_process, tokenizer, stocks_df): 78 | text_list = tokenizer.token(text) 79 | # 提取text中涉及到的股票实体,并且转换成股票代码 80 | stock_list = data_process.find_stocks(text_list, stocks_df) 81 | del data_process, tokenizer 82 | gc.collect() 83 | logging.logger.info("__cut_process ing") 84 | return stock_list 85 | 86 | 87 | def discuss_batch(discuss_df, data_process, tokenizer, stocks_df): 88 | # 对讨论数据做分词并提取股票列表 89 | discuss_df['stock_list'] = discuss_df['text'].apply(cut_process, args=(data_process, tokenizer, stocks_df)) 90 | 91 | # 对result_df下的文章id和股票集合进行结构调整 92 | # 可以改进成直接调用transform_fuc 93 | apply_func = lambda x: transform_fuc(x[0], x[1]) 94 | discuss_df['transform_res'] = discuss_df[['xid', 'stock_list']].apply(apply_func, axis=1) 95 | # print(result_df['transform_res']) 96 | 97 | # 将若干个list合并成一个list 98 | transform_res_list = [] 99 | for i in discuss_df['transform_res'].values: 100 | transform_res_list += i 101 | 102 | # 转换成DataFrame格式 103 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'xid']) 104 | 105 | # 将数据根据股票分组 106 | transform_res_grouped = transform_res_df.groupby('stock') 107 | 108 | # 合并每个分组中的文章id 109 | res_grouped = [] 110 | for group_index, group_df in transform_res_grouped: 111 | res_grouped.append([group_index, ','.join(group_df['xid'])]) 112 | # print(res_grouped) 113 | 114 | # 构建成dataFrame格式,结合运行日期,保存到数据库中 115 | batch_result = pd.DataFrame(res_grouped, columns=['stock', 'xid_list']) 116 | 117 | logging.logger.info("length of batch_result: %s" % len(batch_result)) 118 | del transform_res_df, transform_res_grouped, discuss_df 119 | gc.collect() 120 | return batch_result 121 | 122 | 123 | if __name__ == '__main__': 124 | st_time = datetime.now() 125 | 126 | # 解析器所需要的数据初始化 127 | stop_words = load_stop_words() 128 | _, stocks_df = dicts.load_stock_data() 129 | data_process = DataPressing() 130 | tokenizer = Tokenizer(data_process, stop_words) 131 | 132 | # 新建空df用于存放预处理的数据 133 | test_df = pd.DataFrame() 134 | # 读取数据 135 | target_df = read_csv() 136 | logging.logger.info('length of target from csv:{}'.format(len(target_df))) 137 | # 测试用 138 | target_df = target_df.head(500) 139 | # 可以优化, 在read_csv中添加dtype 140 | # funccc = lambda x: str(x) # 类型转换 141 | # test_df['xid'] = target_df['id'].apply(funccc) 142 | # test_df['uid'] = target_df['uid'].apply(funccc) 143 | test_df['xid'] = target_df['id'] 144 | # 转换时间格式 145 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 146 | # 将id和unix_time构建成一个整体 147 | test_df['information_dic'] = test_df[['xid', 'unix_time']].apply(create_dic, axis=1, args=('xid', 'unix_time')) 148 | 149 | # desc和rdesc两个讨论合并在一起处理 150 | funcc = lambda x: str(x[0]) + '.' 
+ str(x[1]) 151 | test_df['text'] = target_df[['desc', 'rdesc']].apply(funcc, axis=1) 152 | 153 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 154 | # print(test_df[['unix_time', 'xid', 'text']]) 155 | del target_df 156 | gc.collect() 157 | 158 | # 获取时间的最大值和最小值 159 | # start_date = time_util.timestamp_to_time(target_df['create_at'].min(), style='%Y-%m-%d') 160 | # stop_date = time_util.timestamp_to_time(target_df['create_at'].max(), style='%Y-%m-%d') 161 | # 获取时间的最大值和最小值 162 | start_date = test_df['unix_time'].min() 163 | stop_date = test_df['unix_time'].max() 164 | logging.logger.info('start_time: {}, stop_time: {}'.format(start_date, stop_date)) 165 | 166 | start_time = datetime.strptime(str(start_date), "%Y-%m-%d") 167 | stop_time = datetime.strptime(str(stop_date), "%Y-%m-%d") 168 | tmp_time = stop_time 169 | while tmp_time >= start_time: 170 | # 从最大的一天开始倒数着计算每一天 171 | # 读取当天的数据 172 | tmp_date = datetime.strftime(tmp_time, "%Y-%m-%d") 173 | discuss_df = test_df.loc[test_df['unix_time'] == tmp_date] 174 | if len(discuss_df) == 0: 175 | logging.logger.warning("{} has no discuss data".format(tmp_date)) 176 | tmp_time = tmp_time - timedelta(days=1) 177 | continue 178 | logging.logger.info("computing {} data at ".format(len(discuss_df), tmp_date)) 179 | # 单进程调用解析器 180 | result = discuss_batch(discuss_df, data_process, tokenizer, stocks_df) 181 | result['created_date'] = tmp_date 182 | logging.logger.info("{} has {} result data".format(tmp_date, len(result))) 183 | 184 | # # 创建数据库引擎 185 | # engine_mysql_test = data_source.GetDataEngine("XAVIER_DB") 186 | # engine_mysql = data_source.GetDataEngine("XAVIER") 187 | # result.to_sql('history_discuss_filter', engine_mysql, if_exists='append', index=False) 188 | # engine_sqlite = data_source.GetDataEngine('XAVIER_SQLITE') 189 | # result.to_sql('history_discuss_filter', engine_sqlite, if_exists='append', index=False) 190 | 191 | # 计算前一天时间 192 | del result, discuss_df 193 | gc.collect() 194 | tmp_time = tmp_time - timedelta(days=1) 195 | 196 | end_time = datetime.now() 197 | print((end_time - st_time).seconds) 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_batch_multi.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu_dicsuss_batch.py 8 | @time: 2019-03-29 14:05 9 | """ 10 | import gc 11 | import os 12 | import glob 13 | import pandas as pd 14 | from datetime import datetime, timedelta 15 | from src.utils import time_util 16 | from src.utils.log import log_util 17 | from src.parser.xueqiu.discuss_parser import discuss_parser 18 | 19 | logging = log_util.Logger('xueqiu_discuss_batch') 20 | 21 | 22 | # 使用desc和rdesc作为当前用户的讨论数据,用户id为当前用户id,讨论id为 23 | def read_csv(path=None): 24 | """ 25 | 读取原始csv文件 26 | :param path: 27 | :return: 28 | """ 29 | if path is None: 30 | path = '/Users/li/Desktop/sets1' 31 | file_list = glob.glob(os.path.join(path, "*.csv")) 32 | data_list = [] 33 | for f in file_list: 34 | data_list.append(pd.read_csv(f, header=0, 35 | names=[u'create_at', u'desc', u'id', u'rcreate_at', u'rdesc', u'rid', u'rtitle', 36 | u'ruid', u'screen_sname', u'uid'], dtype={u'id': str, u'uid': str}, encoding='utf-8')) 37 | # data_list.append(pd.read_csv(f)) 38 | 39 | df_result = pd.concat(data_list, sort=True) 40 | 41 | return df_result 42 | 43 | 44 | def create_dic(df, xid, unix_time): 45 | """ 46 | 将xid和unix_time构建成一个list 47 | :param df: 48 | :param xid: 49 | :param unix_time: 50 | :return: 51 | """ 52 | tmp = dict() 53 | tmp['xid'] = df[xid] 54 | tmp['unix_time'] = df[unix_time] 55 | return [df[xid], df[unix_time]] 56 | 57 | 58 | # 数据结构调整 59 | def transform_fuc(id, stock_list): 60 | """ 61 | 将user_id和stock_list两两组合成tuple的list集合 62 | :param id: str 63 | :param stock_list: list 64 | :return: 65 | """ 66 | if len(stock_list) <= 0: 67 | pass 68 | user_id_list = [id] * len(stock_list) 69 | tuple_zip = zip(stock_list, user_id_list) 70 | tuple_list = list(tuple_zip) 71 | return tuple_list 72 | 73 | 74 | def discuss_batch(discuss_df, xq_discuss_parser): 75 | # 对讨论数据做分词并提取股票列表,使用多进程解析器 76 | result_df = xq_discuss_parser.run(discuss_df) 77 | 78 | # 对result_df下的文章id和股票集合进行结构调整 79 | # 可以改进成直接调用transform_fuc 80 | apply_func = lambda x: transform_fuc(x[0], x[1]) 81 | result_df['transform_res'] = result_df[['xid', 'stock_list ']].apply(apply_func, axis=1) 82 | # print(result_df['transform_res']) 83 | 84 | # 将若干个list合并成一个list 85 | transform_res_list = [] 86 | for i in result_df['transform_res'].values: 87 | transform_res_list += i 88 | 89 | # 转换成DataFrame格式 90 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'xid']) 91 | 92 | # 将数据根据股票分组 93 | transform_res_grouped = transform_res_df.groupby('stock') 94 | 95 | # 合并每个分组中的文章id 96 | res_grouped = [] 97 | for group_index, group_df in transform_res_grouped: 98 | res_grouped.append([group_index, ','.join(group_df['xid'])]) 99 | # print(res_grouped) 100 | 101 | # 构建成dataFrame格式,结合运行日期,保存到数据库中 102 | result = pd.DataFrame(res_grouped, columns=['stock', 'xid_list']) 103 | 104 | logging.logger.info("length of result: %s" % len(result)) 105 | del xq_discuss_parser, result_df, transform_res_df, transform_res_grouped 106 | gc.collect() 107 | return result 108 | 109 | 110 | if __name__ == '__main__': 111 | st_time = datetime.now() 112 | 113 | # 新建雪球多进程解析器 114 | xq_discuss_parser = discuss_parser.DiscussParser() 115 | 116 | # 新建空df用于存放预处理的数据 117 | test_df = pd.DataFrame() 118 | target_df = read_csv() 119 | logging.logger.info('length of target from csv:{}'.format(len(target_df))) 120 | # 测试用 121 | target_df = target_df.head(50) 122 | # 可以优化 123 | # funccc = lambda x: str(x) # 类型转换 124 | # test_df['xid'] = target_df['id'].apply(funccc) 125 | # test_df['uid'] = target_df['uid'].apply(funccc) 126 | test_df['xid'] = target_df['id'] 127 
| # 转换时间格式 128 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 129 | # 将id和unix_time构建成一个整体 130 | test_df['information_dic'] = test_df[['xid', 'unix_time']].apply(create_dic, axis=1, args=('xid', 'unix_time')) 131 | 132 | # desc和rdesc两个讨论合并在一起处理 133 | funcc = lambda x: str(x[0]) + '.' + str(x[1]) 134 | test_df['text'] = target_df[['desc', 'rdesc']].apply(funcc, axis=1) 135 | 136 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 137 | # print(test_df[['unix_time', 'xid', 'text']]) 138 | del target_df 139 | gc.collect() 140 | 141 | # 获取时间的最大值和最小值 142 | # start_date = time_util.timestamp_to_time(target_df['create_at'].min(), style='%Y-%m-%d') 143 | # stop_date = time_util.timestamp_to_time(target_df['create_at'].max(), style='%Y-%m-%d') 144 | # 获取时间的最大值和最小值 145 | start_date = test_df['unix_time'].min() 146 | stop_date = test_df['unix_time'].max() 147 | logging.logger.info('start_time: {}, stop_time: {}'.format(start_date, stop_date)) 148 | 149 | start_time = datetime.strptime(str(start_date), "%Y-%m-%d") 150 | stop_time = datetime.strptime(str(stop_date), "%Y-%m-%d") 151 | tmp_time = stop_time 152 | while tmp_time >= start_time: 153 | # 从最大的一天开始倒数着计算每一天 154 | # 读取当天的数据 155 | tmp_date = datetime.strftime(tmp_time, "%Y-%m-%d") 156 | discuss_df = test_df.loc[test_df['unix_time'] == tmp_date] 157 | if len(discuss_df) == 0: 158 | logging.logger.warning("{} has no discuss data".format(tmp_date)) 159 | tmp_time = tmp_time - timedelta(days=1) 160 | continue 161 | logging.logger.info("computing {} data at ".format(len(discuss_df), tmp_date)) 162 | # 调用多进程解析器 163 | result = discuss_batch(discuss_df, xq_discuss_parser) 164 | result['created_date'] = tmp_date 165 | logging.logger.info("{} has {} result data".format(tmp_date, len(result))) 166 | 167 | # # 创建数据库引擎 168 | # engine_mysql_test = data_source.GetDataEngine("XAVIER_DB") 169 | # engine_mysql = data_source.GetDataEngine("XAVIER") 170 | # result.to_sql('history_discuss_filter', engine_mysql, if_exists='append', index=False) 171 | 172 | # 计算前一天时间 173 | del result, discuss_df 174 | gc.collect() 175 | tmp_time = tmp_time - timedelta(days=1) 176 | 177 | end_time = datetime.now() 178 | print((end_time - st_time).seconds) 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_csv.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu_discuss_csv.py 8 | @time: 2019-03-23 14:23 9 | """ 10 | 11 | # 从csv文件中读取文件 12 | import time 13 | import os 14 | import glob 15 | import pandas as pd 16 | from src.utils import time_util 17 | from src.utils.engine import data_source 18 | from src.utils.log import log_util 19 | from src.parser.xueqiu.discuss_parser import discuss_parser 20 | 21 | logging = log_util.Logger('xueqiu_discuss_csv') 22 | 23 | 24 | # 使用desc和rdesc作为当前用户的讨论数据,用户id为当前用户id,讨论id为 25 | def read_csv(path=None): 26 | """ 27 | 读取原始csv文件 28 | :param path: 29 | :return: 30 | """ 31 | if path is None: 32 | path = '/Users/li/Desktop/sets1' 33 | file_list = glob.glob(os.path.join(path, "*.csv")) 34 | data_list = [] 35 | for f in file_list: 36 | data_list.append(pd.read_csv(f, header=0, 37 | names=[u'create_at', u'desc', u'id', u'rcreate_at', u'rdesc', u'rid', u'rtitle', 38 | u'ruid', u'screen_sname', u'uid'], encoding='utf-8')) 39 | # data_list.append(pd.read_csv(f)) 40 | 41 | df_result = pd.concat(data_list, sort=True) 42 | 43 | return df_result 44 | 45 | 46 | def create_dic(df, xid, unix_time): 47 | """ 48 | 将xid和unix_time构建成一个list 49 | :param df: 50 | :param xid: 51 | :param unix_time: 52 | :return: 53 | """ 54 | tmp = dict() 55 | tmp['xid'] = df[xid] 56 | tmp['unix_time'] = df[unix_time] 57 | return [df[xid], df[unix_time]] 58 | 59 | 60 | # 数据结构调整 61 | def transform_fuc(id, stock_list): 62 | """ 63 | 将user_id和stock_list两两组合成tuple的list集合 64 | :param id: str 65 | :param stock_list: list 66 | :return: 67 | """ 68 | if len(stock_list) <= 0: 69 | pass 70 | user_id_list = [id] * len(stock_list) 71 | tuple_zip = zip(stock_list, user_id_list) 72 | tuple_list = list(tuple_zip) 73 | return tuple_list 74 | 75 | 76 | if __name__ == '__main__': 77 | start_time = time.time() 78 | xq_discuss_parser = discuss_parser.DiscussParser() 79 | 80 | test_df = pd.DataFrame() 81 | target_df = read_csv() 82 | logging.logger.info('length of target from csv:{}'.format(len(target_df))) 83 | # 测试用 84 | target_df = target_df.head(50) 85 | # 可以优化 86 | funccc = lambda x: str(x) # 类型转换 87 | test_df['xid'] = target_df['id'].apply(funccc) 88 | test_df['uid'] = target_df['uid'].apply(funccc) 89 | # 转换时间格式 90 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 91 | # 将id和unix_time构建成一个整体 92 | test_df['information_dic'] = test_df[['xid', 'unix_time']].apply(create_dic, axis=1, args=('xid', 'unix_time')) 93 | 94 | # desc和rdesc两个讨论合并在一起处理 95 | funcc = lambda x: str(x[0]) + '.' 
+ str(x[1]) 96 | test_df['text'] = target_df[['desc', 'rdesc']].apply(funcc, axis=1) 97 | 98 | # 提取股票实体词 99 | result_df = xq_discuss_parser.run(test_df) 100 | # print(result_df[['stock_list', 'information_dic']]) 101 | 102 | # 对result_df下的文章id和股票集合进行结构调整 103 | apply_func = lambda x: transform_fuc(x[0], x[1]) 104 | result_df['transform_res'] = result_df[['information_dic', 'stock_list']].apply(apply_func, axis=1) 105 | # print(result_df['transform_res']) 106 | # print(result_df['transform_res']) 107 | 108 | # 将若干个list合并成一个list 109 | transform_res_list = [] 110 | for i in result_df['transform_res'].values: 111 | transform_res_list += i 112 | 113 | # 转换成DataFrame格式 114 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'information_list']) 115 | # print(transform_res_df[['stock', 'information_list']]) 116 | # 将数据根据股票分组 117 | transform_res_grouped = transform_res_df.groupby('stock') 118 | 119 | # 合并每个分组中的文章id 120 | res_grouped = [] 121 | for group_index, group_value in transform_res_grouped: 122 | if group_index is None or group_index: 123 | pass 124 | tmp_res = [] 125 | for value in group_value['information_list']: 126 | tmp_res.append(value) 127 | res_grouped.append([group_index, tmp_res]) 128 | 129 | # 构建成dataFrame格式,结合运行日期,保存到数据库中 130 | result = pd.DataFrame(res_grouped, columns=['stock', 'information_list']) 131 | # print("result %s" % result) 132 | 133 | result_dataframe = pd.DataFrame() 134 | for i, j in result.iterrows(): 135 | tt = pd.DataFrame(j['information_list'], columns=['xid', 'created_time']) 136 | # print('tt %s' % tt) 137 | tt_grouped = tt.groupby('created_time') 138 | 139 | # 合并每个分组中的文章id 140 | res_grouped = [] 141 | for i1, j1 in tt_grouped: 142 | # print('i1 %s' % i1) 143 | # print('j1 %s' % j1) 144 | res_grouped.append([i1, ','.join(j1['xid'])]) 145 | # print(res_grouped) 146 | 147 | result_df = pd.DataFrame(res_grouped, columns=['creates_time', 'xid_list']) 148 | result_df['stock'] = j['stock'] 149 | 150 | # print(result_df) 151 | result_dataframe = result_dataframe.append(result_df, ignore_index=True) 152 | logging.logger.info('spend %s' % (time.time() - start_time)) 153 | logging.logger.info('length of result dataframe: %s' % len(result_dataframe)) 154 | 155 | # 数据库存储 156 | # engine_sqlite = data_source.GetDataEngine('XAVIER_SQLITE') 157 | engine_mysql = data_source.GetDataEngine("XAVIER") 158 | 159 | result_dataframe.to_sql('history_discuss_stock_filter_test', engine_mysql, if_exists='replace', index=False) 160 | # result_dataframe.to_sql('history_discuss_stock_filter', engine_sqlite, if_exists='replace', index=False) 161 | # -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_csv_bak.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu_discuss_csv.py 8 | @time: 2019-03-23 14:23 9 | """ 10 | 11 | # 从csv文件中读取文件] 12 | import sys 13 | sys.path.append('../') 14 | sys.path.append('../../') 15 | sys.path.append('../../../') 16 | sys.path.append('../../../../') 17 | sys.path.append('../../../../../') 18 | import os 19 | import glob 20 | import time 21 | import pandas as pd 22 | from joblib import Parallel, delayed 23 | import multiprocessing 24 | from src.configure import conf 25 | from src.utils import time_util, dicts 26 | from src.utils.engine import data_source 27 | from src.utils.data_process import DataPressing 28 | from src.utils.tokenization import Tokenizer, load_stop_words 29 | 30 | 31 | # 使用desc和rdesc作为当前用户的讨论数据,用户id为当前用户id,讨论id为 32 | def read_csv(): 33 | path = '/Users/li/Desktop/sets1' 34 | file_list = glob.glob(os.path.join(path, "*.csv")) 35 | data_list = [] 36 | for f in file_list: 37 | data_list.append(pd.read_csv(f, header=0, 38 | names=[u'create_at', u'desc', u'id', u'rcreate_at', u'rdesc', u'rid', u'rtitle', 39 | u'ruid', u'screen_sname', u'uid'], encoding='utf-8')) 40 | # data_list.append(pd.read_csv(f)) 41 | 42 | df_result = pd.concat(data_list, sort=True) 43 | print(len(df_result)) 44 | # print(df_result.keys()) 45 | # print(df_result.head()) 46 | 47 | return df_result 48 | 49 | 50 | # 导入股票实体词 51 | stock_code_dict = [] # 股票代码 52 | stock_dict = [] 53 | 54 | 55 | def load_stock_data(): 56 | dic_path = conf.dic_path 57 | st_path = dic_path + "/stock_words.txt" 58 | st_new_path = dic_path + "/stock.csv" 59 | for st in open(st_path): 60 | # st = st.decode("utf8") 61 | code1, st_code = st.split("\t") 62 | code, stock = st_code.split(",") 63 | stock_code_dict.append(code.strip("\n")) 64 | stock_dict.append(stock.strip("\n")) 65 | 66 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 67 | # stock_df.append(stocks_df.set_index('SESNAME')) 68 | for index, row in stocks_df.iterrows(): 69 | stock_dict.append(row.SESNAME) 70 | stock_dict.append(row.SYMBOL) 71 | return stock_dict, stocks_df 72 | 73 | 74 | _, stocks_df = load_stock_data() 75 | 76 | # 识别评论中的股票实体。 77 | # 对讨论进行分词,然后提取评论中的股票实体。 78 | data_process = DataPressing() 79 | dict_init = dicts.init() 80 | stop_words = load_stop_words() 81 | tokenizer = Tokenizer(data_process, stop_words) 82 | 83 | 84 | # 整理股票代码 85 | stocks_df = stocks_df.set_index('SESNAME') 86 | # print('stocks_df %s' % stocks_df) 87 | 88 | 89 | def cut_process(text): 90 | """ 91 | 数据处理模块, 分词、提取股票实体词 92 | :param text: 93 | :return: 94 | """ 95 | # 分词 96 | dicts.init() 97 | text_list = tokenizer.token(text) 98 | # 提取text中涉及到的股票实体,并且转换成股票代码 99 | stock_list = data_process.find_stocks(text_list, stocks_df) 100 | # stock_list = ','.join(stock_list) 101 | return stock_list 102 | 103 | 104 | def tmp_func(df): 105 | """ 106 | apply函数封装 107 | :param df: 108 | :return: 109 | """ 110 | df['stock_list'] = df['text'].apply(cut_process) 111 | return df[['xid', 'uid', 'stock_list', 'unix_time', 'information_dic']] 112 | 113 | 114 | def apply_parallel(df_grouped, func): 115 | """ 116 | # 多进程处理 117 | :param df_grouped: 118 | :param func: 119 | :return: 120 | """ 121 | ret_lst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in df_grouped) 122 | 123 | return pd.concat(ret_lst) 124 | 125 | 126 | def run(target_df): 127 | """ 128 | 多进程处理主程序 129 | :param target_df: 130 | :return: 131 | """ 132 | # 将输入数据按照 133 | df_grouped = target_df.groupby(target_df.index) 134 | res = apply_parallel(df_grouped, tmp_func) 135 | return res 136 | 137 | 138 | def 
create_dic(df, xid, unix_time): 139 | tmp = dict() 140 | tmp['xid'] = df[xid] 141 | tmp['unix_time'] = df[unix_time] 142 | # return str(tmp 143 | return [df[xid], df[unix_time]] 144 | 145 | 146 | # 数据结构调整 147 | def transform_fuc(id, stock_list): 148 | """ 149 | 将user_id和stock_list两两组合成tuple的list集合 150 | :param id: str 151 | :param stock_list: list 152 | :return: 153 | """ 154 | if len(stock_list) <= 0: 155 | pass 156 | user_id_list = [id] * len(stock_list) 157 | tuple_zip = zip(stock_list, user_id_list) 158 | tuple_list = list(tuple_zip) 159 | return tuple_list 160 | 161 | 162 | if __name__ == '__main__': 163 | 164 | start_time = time.time() 165 | test_df = pd.DataFrame() 166 | target_df = read_csv() 167 | # 测试用 168 | target_df = target_df.head(5) 169 | # 可以优化 170 | funccc = lambda x: str(x) # 类型转换 171 | test_df['xid'] = target_df['id'].apply(funccc) 172 | test_df['uid'] = target_df['uid'].apply(funccc) 173 | # 转换时间格式 174 | test_df['unix_time'] = target_df['create_at'].apply(time_util.timestamp_to_time, style='%Y-%m-%d') 175 | # 将id和unix_time构建成一个整体 176 | test_df['information_dic'] = test_df[['xid', 'unix_time']].apply(create_dic, axis=1, args=('xid', 'unix_time')) 177 | 178 | # desc和rdesc两个讨论合并在一起处理 179 | funcc = lambda x: str(x[0]) + '.' + str(x[1]) 180 | test_df['text'] = target_df[['desc', 'rdesc']].apply(funcc, axis=1) 181 | 182 | # 提取股票实体词 183 | result_df = run(test_df) 184 | # print(result_df[['stock_list', 'information_dic']]) 185 | 186 | # 对result_df下的文章id和股票集合进行结构调整 187 | apply_func = lambda x: transform_fuc(x[0], x[1]) 188 | result_df['transform_res'] = result_df[['information_dic', 'stock_list']].apply(apply_func, axis=1) 189 | # print(result_df['transform_res']) 190 | # print(result_df['transform_res']) 191 | 192 | # 将若干个list合并成一个list 193 | transform_res_list = [] 194 | for i in result_df['transform_res'].values: 195 | transform_res_list += i 196 | 197 | # 转换成DataFrame格式 198 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'information_list']) 199 | # print(transform_res_df[['stock', 'information_list']]) 200 | # 将数据根据股票分组 201 | transform_res_grouped = transform_res_df.groupby('stock') 202 | 203 | # 合并每个分组中的文章id 204 | res_grouped = [] 205 | for i, j in transform_res_grouped: 206 | if i is None or i: 207 | pass 208 | tmp_res = [] 209 | for k in j['information_list']: 210 | tmp_res.append(k) 211 | res_grouped.append([i, tmp_res]) 212 | 213 | # 构建成dataFrame格式,结合运行日期,保存到数据库中 214 | result = pd.DataFrame(res_grouped, columns=['stock', 'information_list']) 215 | # print("result %s" % result) 216 | 217 | result_dataframe = pd.DataFrame() 218 | for i, j in result.iterrows(): 219 | tt = pd.DataFrame(j['information_list'], columns=['xid', 'created_time']) 220 | # print('tt %s' % tt) 221 | tt_grouped = tt.groupby('created_time') 222 | 223 | # 合并每个分组中的文章id 224 | res_grouped = [] 225 | for i1, j1 in tt_grouped: 226 | # print('i1 %s' % i1) 227 | # print('j1 %s' % j1) 228 | res_grouped.append([i1, ','.join(j1['xid'])]) 229 | # print(res_grouped) 230 | 231 | result_df = pd.DataFrame(res_grouped, columns=['creates_time', 'xid_list']) 232 | result_df['stock'] = j['stock'] 233 | 234 | # print(result_df) 235 | result_dataframe = result_dataframe.append(result_df, ignore_index=True) 236 | print('spend %s' % (time.time() - start_time)) 237 | print('result_dataframe %s' % result_dataframe) 238 | 239 | # 数据库存储 240 | engine_sqlite = data_source.GetDataEngine('XAVIER_SQLITE') 241 | engine_mysql = data_source.GetDataEngine("XAVIER") 242 | 243 | 
result_dataframe.to_sql('history_discuss_stock_filter', engine_mysql, if_exists='replace', index=False) 244 | result_dataframe.to_sql('history_discuss_stock_filter', engine_sqlite, if_exists='replace', index=False) 245 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: xueqiu_discuss_daily.py 8 | @time: 2019-03-18 16:35 9 | 对大V评论进行分析,提取大V评论中的股票实体,并且整合成股票ID:{评论ID, 评论大VID, 评论时间, 评论内容} 10 | 计算时间粒度,一天 11 | 每天定时统计, 每天早上八点定时运行 12 | """ 13 | import sys 14 | sys.path.append('../') 15 | sys.path.append('../../') 16 | sys.path.append('../../../') 17 | sys.path.append('../../../../') 18 | sys.path.append('../../../../../') 19 | 20 | from src.utils import time_util 21 | from src.utils.log import log_util 22 | import pandas as pd 23 | from sqlalchemy import create_engine 24 | from src.data_reader import read_all_data 25 | from src.parser.xueqiu.discuss_parser import discuss_parser, format_transform 26 | 27 | logging = log_util.Logger('discuss_stock_filter_daily') 28 | 29 | 30 | # 数据结构调整 31 | def transform_fuc(id, stock_list): 32 | """ 33 | 将user_id和stock_list两两组合成tuple的list集合 34 | :param id: str 35 | :param stock_list: list 36 | :return: 37 | """ 38 | if len(stock_list) <= 0: 39 | pass 40 | user_id_list = [id] * len(stock_list) 41 | tuple_zip = zip(stock_list, user_id_list) 42 | tuple_list = list(tuple_zip) 43 | return tuple_list 44 | 45 | 46 | if __name__ == '__main__': 47 | pd.set_option('display.max_rows', None, 'display.max_columns', None, "display.max_colwidth", 1000, 'display.width', 1000) 48 | # engine_mysql_test = GetDataEngine("VISIONTEST") 49 | # engine_mysql = data_source.GetDataEngine("VISION") 50 | engine_mysql_test = create_engine('mysql+mysqlconnector://test_edit:test_edit_2019@db1.irongliang.com:3306/test') 51 | 52 | xq_parser = discuss_parser.DiscussParser() 53 | 54 | '''数据读取部分''' 55 | '''根据指定的时间格式, 从指定数据库中读取指定表中的数据''' 56 | # 获取两个指定的时间点 57 | # 起始时间 58 | stop_time = time_util.get_integral_point_time(0) 59 | # 截止时间为起始时间的前一天 60 | start_time = time_util.get_integral_point_time(0) - 86400 # (24*60*60) 61 | 62 | # 测试用 63 | # start_time = 1556380800 64 | # stop_time = 1556380800 + 86400 65 | 66 | logging.logger.info("program start at {}".format(time_util.timestamp_to_time(start_time), "%Y-%m-%d")) 67 | logging.logger.info("program stop at {}".format(time_util.timestamp_to_time(stop_time), "%Y-%m-%d")) 68 | # 读取原始雪球评论数据 69 | # sheet_name = 'xueqiu_discuss' 70 | # sql = "SELECT xid, uid, title, text, unix_time FROM xavier.{} WHERE unix_time >={} AND unix_time <= {} order by unix_time".format(sheet_name, str(start_time), str(stop_time)) 71 | # 雪球评论所保存的表 72 | sheet_name = 'xq_comment' 73 | # 读取指定时间段的所有数据 74 | sql = "SELECT * FROM test.{} WHERE created_at >={} AND created_at <= {} order by created_at".format(sheet_name, str(start_time * 1000), str(stop_time * 1000)) 75 | 76 | # 读取需要处理的数据,从数据库中以DataFrame的格式读取。 77 | discuss_df = read_all_data(sheet_name, engine_mysql_test, sql) 78 | # 测试用 79 | # discuss_df = discuss_df.head() 80 | # print('discuss_df %s' % discuss_df) 81 | '''数据读取部分''' 82 | 83 | if len(discuss_df) <= 0: 84 | logging.logger.warning('there is no new discuss yesterday') 85 | exit() 86 | else: 87 | logging.logger.info('load discuss data from mysql successful') 88 | # 进行分词,提取股票特征词等操作 89 | result_df = 
xq_parser.run(discuss_df) 90 | 91 | # 对result_df下的文章id和股票集合进行结构调整 92 | # 可以改进成直接调用transform_fuc 93 | apply_func = lambda x: transform_fuc(str(x[0]), x[1]) 94 | # id: 文章id, stock_list: 提取的的股票集合 95 | result_df['transform_res'] = result_df[['id', 'stock_list']].apply(apply_func, axis=1) 96 | # print(result_df['transform_res']) 97 | 98 | # 将dataframe中每一行的若干个list合并成一个list 99 | transform_res_list = [] 100 | for i in result_df['transform_res'].values: 101 | transform_res_list += i 102 | 103 | # 转换成DataFrame格式 104 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'xid']) 105 | # print(transform_res_df) 106 | 107 | # # 将数据根据股票分组 108 | transform_res_grouped = transform_res_df.groupby('stock') 109 | # 110 | # 合并每个分组中的文章id 111 | res_grouped = [] 112 | for stock, group_df in transform_res_grouped: 113 | res_grouped.append([stock, ','.join(group_df['xid'])]) 114 | # print(res_grouped) 115 | 116 | # # 构建成dataFrame格式,结合运行日期,保存到数据库中 117 | result = pd.DataFrame(res_grouped, columns=['stock', 'xid_list']) 118 | # 格式化股票代码 119 | result['stock'] = result['stock'].apply(format_transform.symbol_format) 120 | # 统计讨论的数目 121 | result['xid_count'] = result['xid_list'].apply(format_transform.county) 122 | # result['created_at'] = str(datetime.date.today().strftime("%Y-%m-%d")) 123 | result['created_at'] = str(time_util.timestamp_to_time(start_time, "%Y-%m-%d")) 124 | 125 | print(result) 126 | logging.logger.info("length of result: %s" % len(result)) 127 | 128 | # #存储到表中 129 | # 创建数据库引擎 130 | result.to_sql('xueqiu_discuss_count', engine_mysql_test, if_exists='append', index=False) 131 | logging.logger.info('数据保存到第 %s 天' % str(time_util.timestamp_to_time(start_time, "%Y-%m-%d"))) 132 | 133 | logging.logger.info('program finished') 134 | 135 | 136 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily_bak.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu_discuss_daily_bak.py 8 | @time: 2019-03-18 16:35 9 | 对大V评论进行分析,提取大V评论中的股票实体,并且整合成股票ID:{评论ID, 评论大VID, 评论时间, 评论内容} 10 | 计算时间粒度,一天 11 | 每天定时统计, 每天早上八点定时运行 12 | """ 13 | 14 | import datetime 15 | from joblib import Parallel, delayed 16 | import multiprocessing 17 | from src.utils import dicts, time_util 18 | import pandas as pd 19 | from src.data_reader import read_all_data 20 | from src.configure import conf 21 | from src.utils.log import log_util 22 | from src.utils.data_process import DataPressing 23 | from src.utils.tokenization import Tokenizer, load_stop_words 24 | 25 | logging = log_util.Logger('discuss_stock_filter') 26 | 27 | # 获取两个时间点 28 | # 指定时间 29 | stop_time = time_util.get_integral_point_time(9) 30 | # 指定时间前一天 31 | start_time = time_util.get_integral_point_time(9) - 86400 # (24*60*60) 32 | 33 | start_time = 1552179600 34 | stop_time = 1552179600 + 86400 35 | 36 | logging.logger.info("program start at {}".format(start_time)) 37 | # 读取原始雪球评论数据 38 | sheet_name = 'xueqiu_discuss' 39 | sql = "SELECT xid, uid, title, text, unix_time FROM xavier.{} WHERE unix_time >={} AND unix_time <= {} order by unix_time".format(sheet_name, str(start_time), str(stop_time)) 40 | # sql = "SELECT count(*) FROM xavier_db.%s ORDER BY unix_time" % sheet_name 41 | 42 | # 读取需要处理的数据,从数据库中以DataFrame的格式读取。 43 | discuss_df = read_all_data(sheet_name, sql) 44 | # 测试用 45 | discuss_df = discuss_df.head() 46 | if len(discuss_df) <= 0: 47 | logging.logger.warning('there is no new discuss yesterday') 48 | exit() 49 | else: 50 | logging.logger.info('load discuss data from mysql successful') 51 | 52 | # print('discuss_df %s' % discuss_df) 53 | 54 | 55 | # 导入股票实体词 56 | stock_code_dict = [] # 股票代码 57 | stock_dict = [] 58 | 59 | 60 | def load_stock_data(): 61 | dic_path = conf.dic_path 62 | st_path = dic_path + "/stock_words.txt" 63 | st_new_path = dic_path + "/stock.csv" 64 | for st in open(st_path): 65 | st = st.decode("utf8") 66 | code1, st_code = st.split("\t") 67 | code, stock = st_code.split(",") 68 | stock_code_dict.append(code.strip("\n")) 69 | stock_dict.append(stock.strip("\n")) 70 | 71 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 72 | # stock_df.append(stocks_df.set_index('SESNAME')) 73 | for index, row in stocks_df.iterrows(): 74 | stock_dict.append(row.SESNAME) 75 | stock_dict.append(row.SYMBOL) 76 | return stock_dict, stocks_df 77 | 78 | 79 | _, stocks_df = load_stock_data() 80 | 81 | # 识别评论中的股票实体。 82 | # 对讨论进行分词,然后提取评论中的股票实体。 83 | data_process = DataPressing() 84 | dict_init = dicts.init() 85 | stop_words = load_stop_words() 86 | tokenizer = Tokenizer(data_process, stop_words) 87 | 88 | 89 | # 整理股票代码 90 | stocks_df = stocks_df.set_index('SESNAME') 91 | # print('stocks_df %s' % stocks_df) 92 | 93 | 94 | def cut_process(text): 95 | """ 96 | 数据处理模块, 分词、提取股票实体词 97 | :param text: 98 | :return: 99 | """ 100 | # 分词 101 | dicts.init() 102 | text_list = tokenizer.token(text) 103 | # 提取text中涉及到的股票实体,并且转换成股票代码 104 | stock_list = data_process.find_stocks(text_list, stocks_df) 105 | # res = ','.join(stock_list) 106 | # return res 107 | return stock_list 108 | 109 | 110 | def tmp_func(df): 111 | """ 112 | apply函数封装 113 | :param df: 114 | :return: 115 | """ 116 | df['stock_list'] = df['text'].apply(cut_process) 117 | return df[['xid', 'uid', 'stock_list', 'unix_time']] 118 | 119 | 120 | def apply_parallel(df_grouped, func): 121 | """ 122 | # 多进程处理 123 | :param df_grouped: 124 | :param func: 125 | :return: 126 | """ 127 | ret_lst = 
Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in df_grouped) 128 | 129 | return pd.concat(ret_lst) 130 | 131 | 132 | def run(target_df): 133 | """ 134 | 多进程处理主程序 135 | :param target_df: 136 | :return: 137 | """ 138 | # 将输入数据按照 139 | df_grouped = target_df.groupby(target_df.index) 140 | res = apply_parallel(df_grouped, tmp_func) 141 | return res 142 | 143 | 144 | def kk_test(): 145 | """ 146 | 测试股票实体是否提取成功 147 | :return: 148 | """ 149 | text = "大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!中信证券也上市了,还是注册制的, 中信建投也不错。" 150 | cut_res = cut_process(text) 151 | print('tmp_res %s' % cut_res) 152 | 153 | test_df = pd.DataFrame({'text': [text, text, text, text, text]}) 154 | # 非多进程下直接提取股票实体 155 | test_df['stock_list'] = test_df['text'].apply(cut_process) 156 | print('test_df %s' % test_df) 157 | 158 | # 多进程下提取股票实体 159 | df_grouped = test_df.groupby(test_df.index) 160 | pp = apply_parallel(df_grouped, tmp_func) 161 | print('pp %s' % pp['stock_list']) 162 | 163 | 164 | # 对dicuss_df做提取股票代码操作 165 | result_df = run(discuss_df) 166 | # print(result_df[['xid', 'stock_list']]) 167 | 168 | 169 | # 数据结构调整 170 | def transform_fuc(id, stock_list): 171 | """ 172 | 将user_id和stock_list两两组合成tuple的list集合 173 | :param id: str 174 | :param stock_list: list 175 | :return: 176 | """ 177 | if len(stock_list) <= 0: 178 | pass 179 | user_id_list = [id] * len(stock_list) 180 | tuple_zip = zip(stock_list, user_id_list) 181 | tuple_list = list(tuple_zip) 182 | return tuple_list 183 | 184 | 185 | # 对result_df下的文章id和股票集合进行结构调整 186 | # 可以改进成直接调用transform_fuc 187 | apply_func = lambda x: transform_fuc(x[0], x[1]) 188 | result_df['transform_res'] = result_df[['xid', 'stock_list']].apply(apply_func, axis=1) 189 | print(result_df['transform_res']) 190 | 191 | # 将若干个list合并成一个list 192 | transform_res_list = [] 193 | for i in result_df['transform_res'].values: 194 | transform_res_list += i 195 | 196 | # 转换成DataFrame格式 197 | transform_res_df = pd.DataFrame(transform_res_list, columns=['stock', 'xid']) 198 | 199 | # 将数据根据股票分组 200 | transform_res_grouped = transform_res_df.groupby('stock') 201 | 202 | # 合并每个分组中的文章id 203 | res_grouped = [] 204 | for i, j in transform_res_grouped: 205 | res_grouped.append([i, ','.join(j['xid'])]) 206 | print(res_grouped) 207 | 208 | # 构建成dataFrame格式,结合运行日期,保存到数据库中 209 | result = pd.DataFrame(res_grouped, columns=['stock', 'xid_list']) 210 | 211 | result['created_date'] = str(datetime.date.today().strftime("%Y-%m-%d")) 212 | # print(result) 213 | 214 | # # 存储到表中 215 | # # 创建数据库引擎 216 | # engine_mysql_test = data_source.GetDataEngine("XAVIER_DB") 217 | # engine_mysql = data_source.GetDataEngine("XAVIER") 218 | # result.to_sql('discuss_stock_filter_lists', engine_mysql, if_exists='replace', index=False) 219 | # 220 | # logging.logger.info('program finished') 221 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_parser.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: xueqiu_discuss_parser_bak.py 8 | @time: 2019-03-05 10:31 9 | 对大V评论进行分析,提取大V评论中的股票实体,并且整合成股票ID:{评论ID, 评论大VID, 评论时间, 评论内容} 10 | 计算时间粒度,一天 11 | 12 | """ 13 | from joblib import Parallel, delayed 14 | import multiprocessing 15 | from src.utils import dicts 16 | import pandas as pd 17 | from src.data_reader import read_all_data 18 | from src.configure import conf 19 | from src.utils.engine import data_source 20 | from src.utils.data_process import DataPressing 21 | from src.utils.tokenization import Tokenizer, load_stop_words 22 | 23 | # 读取原始雪球评论数据 24 | sheet_name = 'xueqiu_discuss' 25 | sql = "SELECT xid, uid, title, text, mood, unix_time FROM xavier.%s ORDER BY unix_time" % sheet_name 26 | # sql = "SELECT count(*) FROM xavier_db.%s ORDER BY unix_time" % sheet_name 27 | 28 | 29 | # 导入股票实体词 30 | stock_code_dict = [] # 股票代码 31 | stock_dict = [] 32 | 33 | 34 | def load_stock_data(): 35 | dic_path = conf.dic_path 36 | st_path = dic_path + "/stock_words.txt" 37 | st_new_path = dic_path + "/stock.csv" 38 | for st in open(st_path): 39 | st = st.decode("utf8") 40 | code1, st_code = st.split("\t") 41 | code, stock = st_code.split(",") 42 | stock_code_dict.append(code.strip("\n")) 43 | stock_dict.append(stock.strip("\n")) 44 | 45 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 46 | # stock_df.append(stocks_df.set_index('SESNAME')) 47 | for index, row in stocks_df.iterrows(): 48 | stock_dict.append(row.SESNAME) 49 | stock_dict.append(row.SYMBOL) 50 | return stock_dict, stocks_df 51 | 52 | 53 | _, stocks_df = load_stock_data() 54 | 55 | # 识别评论中的股票实体。 56 | # 对讨论进行分词,然后提取评论中的股票实体。 57 | data_process = DataPressing() 58 | dict_init = dicts.init() 59 | stop_words = load_stop_words() 60 | tokenizer = Tokenizer(data_process, stop_words) 61 | 62 | # 读取需要处理的数据,从数据库中以DataFrame的格式读取。 63 | discuss_df = read_all_data(sheet_name, sql) 64 | # discuss_df = discuss_df.head() 65 | # print('discuss_df %s' % discuss_df['mood']) 66 | 67 | # 整理股票代码 68 | stocks_df = stocks_df.set_index('SESNAME') 69 | # print('stocks_df %s' % stocks_df) 70 | 71 | 72 | def cut_process(text): 73 | """ 74 | 数据处理模块, 分词、提取股票实体词 75 | :param text: 76 | :return: 77 | """ 78 | # 分词 79 | dicts.init() 80 | text_list = tokenizer.token(text) 81 | # 提取text中涉及到的股票实体,并且转换成股票代码 82 | stock_list = data_process.find_stocks(text_list, stocks_df) 83 | res = ','.join(stock_list) 84 | return res 85 | 86 | 87 | def tmp_func(df): 88 | """ 89 | apply函数封装 90 | :param df: 91 | :return: 92 | """ 93 | df['stock_list'] = df['text'].apply(cut_process) 94 | return df[['xid', 'uid', 'mood', 'stock_list', 'unix_time']] 95 | 96 | 97 | def apply_parallel(df_grouped, func): 98 | """ 99 | # 多进程处理 100 | :param df_grouped: 101 | :param func: 102 | :return: 103 | """ 104 | ret_lst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in df_grouped) 105 | 106 | return pd.concat(ret_lst) 107 | 108 | 109 | def run(target_df): 110 | """ 111 | 多进程处理主程序 112 | :param target_df: 113 | :return: 114 | """ 115 | # 将输入数据按照 116 | df_grouped = target_df.groupby(target_df.index) 117 | res = apply_parallel(df_grouped, tmp_func) 118 | return res 119 | 120 | 121 | def kk_test(): 122 | """ 123 | 测试股票实体是否提取成功 124 | :return: 125 | """ 126 | text = "大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!中信证券也上市了,还是注册制的, 中信建投也不错。" 127 | cut_res = cut_process(text) 128 | print('tmp_res %s' % cut_res) 129 | 130 | test_df = pd.DataFrame({'text': [text, text, text, text, text]}) 131 | # 非多进程下直接提取股票实体 132 | test_df['stock_list'] = test_df['text'].apply(cut_process) 133 | 
print('test_df %s' % test_df) 134 | 135 | # 多进程下提取股票实体 136 | df_grouped = test_df.groupby(test_df.index) 137 | pp = apply_parallel(df_grouped, tmp_func) 138 | print('pp %s' % pp['stock_list']) 139 | 140 | 141 | # 结构调整 142 | result_df = run(discuss_df) 143 | # 存储到表中 144 | print(result_df.head()) 145 | 146 | # # 创建数据库引擎 147 | engine_mysql_test = data_source.GetDataEngine("XAVIER_DB") 148 | engine_mysql = data_source.GetDataEngine("XAVIER") 149 | 150 | result_df.to_sql('discuss_stock_filter', engine_mysql, if_exists='replace', index=False) 151 | 152 | -------------------------------------------------------------------------------- /src/parser/xueqiu/discuss_parser/xueqiu_discuss_parser_bak.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: xueqiu_discuss_parser_bak.py 8 | @time: 2019-03-05 10:31 9 | 对大V评论进行分析,提取大V评论中的股票实体,并且整合成股票ID:{评论ID, 评论大VID, 评论时间, 评论内容} 10 | 计算时间粒度,一天 11 | 12 | """ 13 | from joblib import Parallel, delayed 14 | import multiprocessing 15 | from src.utils import dicts 16 | import pandas as pd 17 | from src.data_reader import read_all_data 18 | from src.configure import conf 19 | from src.utils.engine import data_source 20 | from src.utils.data_process import DataPressing 21 | from src.utils.tokenization import Tokenizer, load_stop_words 22 | 23 | # 读取原始雪球评论数据 24 | sheet_name = 'xueqiu_discuss' 25 | sql = "SELECT xid, uid, title, text, mood, unix_time FROM xavier.%s ORDER BY unix_time" % sheet_name 26 | # sql = "SELECT count(*) FROM xavier_db.%s ORDER BY unix_time" % sheet_name 27 | 28 | 29 | # 导入股票实体词 30 | stock_code_dict = [] # 股票代码 31 | stock_dict = [] 32 | 33 | 34 | def load_stock_data(): 35 | dic_path = conf.dic_path 36 | st_path = dic_path + "/stock_words.txt" 37 | st_new_path = dic_path + "/stock.csv" 38 | for st in open(st_path): 39 | st = st.decode("utf8") 40 | code1, st_code = st.split("\t") 41 | code, stock = st_code.split(",") 42 | stock_code_dict.append(code.strip("\n")) 43 | stock_dict.append(stock.strip("\n")) 44 | 45 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 46 | # stock_df.append(stocks_df.set_index('SESNAME')) 47 | for index, row in stocks_df.iterrows(): 48 | stock_dict.append(row.SESNAME) 49 | stock_dict.append(row.SYMBOL) 50 | return stock_dict, stocks_df 51 | 52 | 53 | _, stocks_df = load_stock_data() 54 | 55 | # 识别评论中的股票实体。 56 | # 对讨论进行分词,然后提取评论中的股票实体。 57 | data_process = DataPressing() 58 | dict_init = dicts.init() 59 | stop_words = load_stop_words() 60 | tokenizer = Tokenizer(data_process, stop_words) 61 | 62 | # 读取需要处理的数据,从数据库中以DataFrame的格式读取。 63 | discuss_df = read_all_data(sheet_name, sql) 64 | # discuss_df = discuss_df.head() 65 | # print('discuss_df %s' % discuss_df['mood']) 66 | 67 | # 整理股票代码 68 | stocks_df = stocks_df.set_index('SESNAME') 69 | # print('stocks_df %s' % stocks_df) 70 | 71 | 72 | def cut_process(text): 73 | """ 74 | 数据处理模块, 分词、提取股票实体词 75 | :param text: 76 | :return: 77 | """ 78 | # 分词 79 | dicts.init() 80 | text_list = tokenizer.token(text) 81 | # 提取text中涉及到的股票实体,并且转换成股票代码 82 | stock_list = data_process.find_stocks(text_list, stocks_df) 83 | res = ','.join(stock_list) 84 | return res 85 | 86 | 87 | def tmp_func(df): 88 | """ 89 | apply函数封装 90 | :param df: 91 | :return: 92 | """ 93 | df['stock_list'] = df['text'].apply(cut_process) 94 | return df[['xid', 'uid', 'mood', 'stock_list', 'unix_time']] 95 | 96 | 97 | def apply_parallel(df_grouped, func): 98 | """ 99 | # 多进程处理 100 
| :param df_grouped: 101 | :param func: 102 | :return: 103 | """ 104 | ret_lst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in df_grouped) 105 | 106 | return pd.concat(ret_lst) 107 | 108 | 109 | def run(target_df): 110 | """ 111 | 多进程处理主程序 112 | :param target_df: 113 | :return: 114 | """ 115 | # 将输入数据按照 116 | df_grouped = target_df.groupby(target_df.index) 117 | res = apply_parallel(df_grouped, tmp_func) 118 | return res 119 | 120 | 121 | def kk_test(): 122 | """ 123 | 测试股票实体是否提取成功 124 | :return: 125 | """ 126 | text = "大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!中信证券也上市了,还是注册制的, 中信建投也不错。" 127 | cut_res = cut_process(text) 128 | print('tmp_res %s' % cut_res) 129 | 130 | test_df = pd.DataFrame({'text': [text, text, text, text, text]}) 131 | # 非多进程下直接提取股票实体 132 | test_df['stock_list'] = test_df['text'].apply(cut_process) 133 | print('test_df %s' % test_df) 134 | 135 | # 多进程下提取股票实体 136 | df_grouped = test_df.groupby(test_df.index) 137 | pp = apply_parallel(df_grouped, tmp_func) 138 | print('pp %s' % pp['stock_list']) 139 | 140 | 141 | # 结构调整 142 | result_df = run(discuss_df) 143 | # 存储到表中 144 | print(result_df.head()) 145 | 146 | # # 创建数据库引擎 147 | engine_mysql_test = data_source.GetDataEngine("XAVIER_DB") 148 | engine_mysql = data_source.GetDataEngine("XAVIER") 149 | 150 | result_df.to_sql('discuss_stock_filter', engine_mysql, if_exists='replace', index=False) 151 | 152 | -------------------------------------------------------------------------------- /src/parser/xueqiu/focus_parser/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019-04-29 14:41 9 | """ -------------------------------------------------------------------------------- /src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: xueqiu_focus_statistics.py 8 | @time: 2019-04-29 14:47 9 | 统计累计大V的股票关注数 10 | """ 11 | import sys 12 | sys.path.append('../') 13 | sys.path.append('../../') 14 | sys.path.append('../../../') 15 | sys.path.append('../../../../') 16 | sys.path.append('../../../../../') 17 | 18 | import pandas as pd 19 | import time 20 | from sqlalchemy import create_engine 21 | from src.utils import time_util 22 | from src.utils.log import log_util 23 | from src.data_reader import read_all_data 24 | 25 | 26 | logging = log_util.Logger('xueqiu_focus_statistic') 27 | 28 | 29 | def f(row): 30 | if row[:2] == 'SH': 31 | return str(row[2:]) + '.' + 'XSHG' 32 | elif row[:2] == 'SZ': 33 | return str(row[2:]) + '.' 
+ 'XSHE' 34 | 35 | 36 | if __name__ == '__main__': 37 | pd.set_option('display.max_rows', None, 'display.max_columns', None, "display.max_colwidth", 1000, 'display.width', 1000) 38 | # engine_mysql_test = data_source.GetDataEngine("VISIONTEST") 39 | # engine_mysql = data_source.GetDataEngine("VISION") 40 | engine_mysql_test = create_engine('mysql+mysqlconnector://test_edit:test_edit_2019@db1.irongliang.com:3306/test') 41 | 42 | date_time = time_util.get_integral_point_time(0) 43 | logging.logger.info("program start at {}".format(time_util.timestamp_to_time(date_time), "%Y-%m-%d")) 44 | 45 | # 读取原始大V关注数据 46 | # 大V关注所保存的表 47 | sheet_name = 'xq_user_stock' 48 | # 读取指定时间段的所有数据 49 | sql = "SELECT * FROM test.{} WHERE created <={}".format(sheet_name, str(date_time * 1000)) 50 | 51 | # 读取需要处理的数据,从数据库中以DataFrame的格式读取。 52 | focus_df = read_all_data(sheet_name, engine_mysql_test, sql) 53 | # print(focus_df) 54 | logging.logger.info("导入 %s 条数据" % len(focus_df)) 55 | 56 | res_grouped = [] 57 | focus_grouped = focus_df.groupby('symbol') 58 | for symbol, value in focus_grouped: 59 | counts = value['uid'].count() 60 | res_grouped.append([f(symbol), counts]) 61 | 62 | result = pd.DataFrame(res_grouped, columns=['symbol', 'focus_total_count']) 63 | save_time = date_time - 86400 # (24*60*60) 64 | 65 | result['created_at'] = str(time_util.timestamp_to_time(save_time, "%Y-%m-%d")) 66 | 67 | # 存储到表中 68 | # 创建数据库引擎 69 | result.to_sql('xueqiu_focus_total_count', engine_mysql_test, if_exists='append', index=False) 70 | # print(result) 71 | logging.logger.info('生成 %s 条数据' % len(result)) 72 | logging.logger.info('数据保存到第 %s 天' % str(time_util.timestamp_to_time(save_time, "%Y-%m-%d"))) 73 | logging.logger.info('program finished, end at %s' % str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))) -------------------------------------------------------------------------------- /src/parser/xueqiu/focus_parser/雪球大V关注股票.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": {} 7 | }, 8 | "source": [ 9 | "### 获取数据进行处理" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 5, 15 | "metadata": { 16 | "pycharm": {} 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import os\n", 21 | "import pdb\n", 22 | "import pandas as pd\n", 23 | "import datetime\n", 24 | "from jqdatasdk import *" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 6, 30 | "metadata": { 31 | "pycharm": {} 32 | }, 33 | "outputs": [ 34 | { 35 | "name": "stderr", 36 | "output_type": "stream", 37 | "text": [ 38 | "/home/kerry/work/workenv/alpha_mind/lib/python3.6/site-packages/numpy/lib/arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", 39 | " mask |\u003d (ar1 \u003d\u003d a)\n" 40 | ] 41 | }, 42 | { 43 | "ename": "NameError", 44 | "evalue": "name \u0027vip_stock_sets\u0027 is not defined", 45 | "output_type": "error", 46 | "traceback": [ 47 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 48 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 49 | "\u001b[0;32m\u003cipython-input-6-282aeaea1455\u003e\u001b[0m in \u001b[0;36m\u003cmodule\u003e\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mip_stock_sets\u001b[0m \u001b[0;34m\u003d\u001b[0m 
\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u0027/kywk/data/xq/vip_stock/vip_stock_sets.csv\u0027\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m\u003d\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----\u003e 2\u001b[0;31m vip_stock_sets[\u0027created\u0027] \u003d vip_stock_sets[\u0027created\u0027].apply(lambda x: datetime.datetime.strptime(x, \n\u001b[0m\u001b[1;32m 3\u001b[0m \u0027%Y-%m-%d %H:%M:%S\u0027))\n", 50 | "\u001b[0;31mNameError\u001b[0m: name \u0027vip_stock_sets\u0027 is not defined" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "ip_stock_sets \u003d pd.read_csv(\u0027/kywk/data/xq/vip_stock/vip_stock_sets.csv\u0027, index_col\u003d0).reset_index()\n", 56 | "vip_stock_sets[\u0027created\u0027] \u003d vip_stock_sets[\u0027created\u0027].apply(lambda x: datetime.datetime.strptime(x, \n", 57 | " \u0027%Y-%m-%d %H:%M:%S\u0027))" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "pycharm": {} 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "vip_stock_sets" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "pycharm": {} 75 | }, 76 | "source": [ 77 | "## 只保留沪深股票" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": { 84 | "pycharm": {} 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "def f(row):\n", 89 | " if row[:2] \u003d\u003d \u0027SH\u0027:\n", 90 | " return str(row[2:]) + \u0027.\u0027 + \u0027XSHG\u0027\n", 91 | " elif row[:2] \u003d\u003d \u0027SZ\u0027:\n", 92 | " return str(row[2:]) + \u0027.\u0027 + \u0027XSHE\u0027" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 4, 98 | "metadata": { 99 | "scrolled": false, 100 | "pycharm": {} 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "ex_vip_stock_sets \u003d vip_stock_sets.set_index(\u0027exchange\u0027)\n", 105 | "xsh_market_stock \u003d ex_vip_stock_sets.loc[\u0027SZ\u0027].append(ex_vip_stock_sets.loc[\u0027SH\u0027]).reset_index()\n", 106 | "xsh_market_stock[\u0027symbol\u0027] \u003d xsh_market_stock[\u0027symbol\u0027].apply(f)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "pycharm": {} 113 | }, 114 | "source": [ 115 | "## 统计近段时间股票关注比例" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 22, 121 | "metadata": { 122 | "scrolled": false, 123 | "pycharm": {} 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "bwteen_time \u003d datetime.datetime(2019,2,10)\n", 128 | "time_marekt_stock \u003d xsh_market_stock[xsh_market_stock[\u0027created\u0027] \u003e bwteen_time]\n", 129 | "time_marekt_stock \u003d time_marekt_stock[[\u0027exchange\u0027,\u0027name\u0027,\u0027symbol\u0027,\u0027vid\u0027]]\n", 130 | "symbol_market_stock \u003d time_marekt_stock.groupby(\u0027symbol\u0027).count()\n", 131 | "symbol_market_stock[\u0027ratio\u0027] \u003d symbol_market_stock[\u0027vid\u0027] / len(symbol_market_stock)\n", 132 | "symbol_market_stock \u003d symbol_market_stock.sort_values(by\u003d\u0027ratio\u0027, ascending\u003dFalse)[[\u0027ratio\u0027,\u0027vid\u0027]]\n", 133 | "symbol_market_stock.rename(columns\u003d{\u0027vid\u0027:\u0027count\u0027}, inplace\u003dTrue)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 23, 139 | "metadata": { 140 | 
"pycharm": {} 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "market_stock \u003d time_marekt_stock.drop_duplicates(subset\u003d\u0027symbol\u0027, keep\u003d\u0027first\u0027, inplace\u003dFalse).set_index(\u0027symbol\u0027)\n", 145 | "market_stock \u003d market_stock[[\u0027name\u0027]]\n", 146 | "v_market_stock \u003d symbol_market_stock.merge(market_stock, left_index\u003dTrue, right_index\u003dTrue)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "pycharm": {} 153 | }, 154 | "source": [ 155 | "# 获取对应行业" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 24, 161 | "metadata": { 162 | "pycharm": {} 163 | }, 164 | "outputs": [], 165 | "source": "industry_set \u003d [\u0027801010\u0027, \u0027801020\u0027, \u0027801030\u0027, \u0027801040\u0027, \u0027801050\u0027, \u0027801080\u0027, \u0027801110\u0027, \u0027801120\u0027, \u0027801130\u0027, \n \u0027801140\u0027, \u0027801150\u0027, \u0027801160\u0027, \u0027801170\u0027, \u0027801180\u0027, \u0027801200\u0027, \u0027801210\u0027, \u0027801230\u0027, \u0027801710\u0027,\n \u0027801720\u0027, \u0027801730\u0027, \u0027801740\u0027, \u0027801750\u0027, \u0027801760\u0027, \u0027801770\u0027, \u0027801780\u0027, \u0027801790\u0027, \u0027801880\u0027,\u0027801890\u0027]\nindustry_df \u003d pd.DataFrame(columns\u003dv_market_stock.index)\nfor industry in industry_set:\n industry_stocks \u003d get_industry_stocks(industry)\n industry_stocks \u003d list(set(industry_stocks)\u0026set(v_market_stock.index))\n industry_df.loc[\u0027industry\u0027,industry_stocks] \u003d industry\n\nindustry_df \u003d industry_df.T.dropna()\nindustry_df.reset_index(inplace \u003d True)\nindustry_df.set_index(\u0027symbol\u0027,inplace\u003dTrue)" 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 25, 170 | "metadata": { 171 | "pycharm": {} 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "industry_market_stock \u003d v_market_stock.merge(industry_df,left_index\u003dTrue, right_index\u003dTrue)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 26, 181 | "metadata": { 182 | "pycharm": {} 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "industry_set \u003d [\n", 187 | " {\u0027industry\u0027:\u0027801010\u0027,\u0027indname\u0027:\u0027农林牧渔I\u0027},\n", 188 | " {\u0027industry\u0027:\u0027801020\u0027,\u0027indname\u0027:\u0027采掘I\u0027},\n", 189 | " {\u0027industry\u0027:\u0027801030\u0027,\u0027indname\u0027:\u0027化工I\u0027},\n", 190 | " {\u0027industry\u0027:\u0027801040\u0027,\u0027indname\u0027:\u0027钢铁I\u0027},\n", 191 | " {\u0027industry\u0027:\u0027801050\u0027,\u0027indname\u0027:\u0027有色金属I\u0027},\n", 192 | " {\u0027industry\u0027:\u0027801080\u0027,\u0027indname\u0027:\u0027电子I\u0027},\n", 193 | " {\u0027industry\u0027:\u0027801110\u0027,\u0027indname\u0027:\u0027家用电器I\u0027},\n", 194 | " {\u0027industry\u0027:\u0027801120\u0027,\u0027indname\u0027:\u0027食品饮料I\u0027},\n", 195 | " {\u0027industry\u0027:\u0027801130\u0027,\u0027indname\u0027:\u0027纺织服装I\u0027},\n", 196 | " {\u0027industry\u0027:\u0027801140\u0027,\u0027indname\u0027:\u0027轻工制造I\u0027},\n", 197 | " {\u0027industry\u0027:\u0027801150\u0027,\u0027indname\u0027:\u0027医药生物I\u0027},\n", 198 | " {\u0027industry\u0027:\u0027801160\u0027,\u0027indname\u0027:\u0027公用事业I\u0027},\n", 199 | " {\u0027industry\u0027:\u0027801170\u0027,\u0027indname\u0027:\u0027交通运输I\u0027},\n", 200 | " {\u0027industry\u0027:\u0027801180\u0027,\u0027indname\u0027:\u0027房地产I\u0027},\n", 201 | " 
{\u0027industry\u0027:\u0027801200\u0027,\u0027indname\u0027:\u0027商业贸易I\u0027},\n", 202 | " {\u0027industry\u0027:\u0027801210\u0027,\u0027indname\u0027:\u0027休闲服务I\u0027},\n", 203 | " {\u0027industry\u0027:\u0027801230\u0027,\u0027indname\u0027:\u0027综合I\u0027},\n", 204 | " {\u0027industry\u0027:\u0027801710\u0027,\u0027indname\u0027:\u0027建筑材料I\u0027},\n", 205 | " {\u0027industry\u0027:\u0027801720\u0027,\u0027indname\u0027:\u0027建筑装饰I\u0027},\n", 206 | " {\u0027industry\u0027:\u0027801730\u0027,\u0027indname\u0027:\u0027电气设备I\u0027},\n", 207 | " {\u0027industry\u0027:\u0027801740\u0027,\u0027indname\u0027:\u0027国防军工I\u0027},\n", 208 | " {\u0027industry\u0027:\u0027801750\u0027,\u0027indname\u0027:\u0027计算机I\u0027},\n", 209 | " {\u0027industry\u0027:\u0027801760\u0027,\u0027indname\u0027:\u0027传媒I\u0027},\n", 210 | " {\u0027industry\u0027:\u0027801770\u0027,\u0027indname\u0027:\u0027通信I\u0027},\n", 211 | " {\u0027industry\u0027:\u0027801780\u0027,\u0027indname\u0027:\u0027银行I\u0027},\n", 212 | " {\u0027industry\u0027:\u0027801790\u0027,\u0027indname\u0027:\u0027非银金融I\u0027},\n", 213 | " {\u0027industry\u0027:\u0027801880\u0027,\u0027indname\u0027:\u0027汽车I\u0027},\n", 214 | " {\u0027industry\u0027:\u0027801890\u0027,\u0027indname\u0027:\u0027机械设备I\u0027}]\n", 215 | "market_industry \u003d pd.DataFrame(industry_set).set_index(\u0027industry\u0027)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 27, 221 | "metadata": { 222 | "pycharm": {} 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "industry_market_stock \u003dindustry_market_stock.reset_index().set_index(\n", 227 | " \u0027industry\u0027).merge(market_industry,left_index\u003dTrue, right_index\u003dTrue)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 28, 233 | "metadata": { 234 | "scrolled": false, 235 | "pycharm": {} 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "result_socket \u003d industry_market_stock.sort_values(by\u003d\u0027count\u0027, ascending\u003dFalse)[:50]" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 29, 245 | "metadata": { 246 | "pycharm": {} 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "result_socket.to_csv(bwteen_time.strftime(\"%Y-%m-%d\") + \u0027_stock_50.csv\u0027,encoding\u003d\u0027UTF-8\u0027)\n", 251 | "result_socket.groupby(\u0027indname\u0027).count().to_csv(bwteen_time.strftime(\"%Y-%m-%d\") + \u0027group_50.csv\u0027,encoding\u003d\u0027UTF-8\u0027)\n", 252 | "\n", 253 | "industry_market_stock.groupby(\u0027indname\u0027).count().to_csv(bwteen_time.strftime(\"%Y-%m-%d\") + \u0027group_all.csv\u0027,encoding\u003d\u0027UTF-8\u0027)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "pycharm": {} 261 | }, 262 | "outputs": [], 263 | "source": [] 264 | } 265 | ], 266 | "metadata": { 267 | "kernelspec": { 268 | "display_name": "Python 3", 269 | "language": "python", 270 | "name": "python3" 271 | }, 272 | "language_info": { 273 | "codemirror_mode": { 274 | "name": "ipython", 275 | "version": 3 276 | }, 277 | "file_extension": ".py", 278 | "mimetype": "text/x-python", 279 | "name": "python", 280 | "nbconvert_exporter": "python", 281 | "pygments_lexer": "ipython3", 282 | "version": "3.6.7" 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 2 287 | } -------------------------------------------------------------------------------- /src/parser/xueqiu/log/dict_log.log.2019-04-29: 
-------------------------------------------------------------------------------- 1 | 2019-04-29 16:09:34,506 - /Users/li/PycharmProjects/event_parser/src/utils/dicts.py[line:189] - INFO: [Info] jieba总共添加了10889个自定义词汇。 2 | 2019-04-29 16:09:51,171 - /Users/li/PycharmProjects/event_parser/src/utils/dicts.py[line:189] - INFO: [Info] jieba总共添加了10889个自定义词汇。 3 | 2019-04-29 16:18:37,725 - /Users/li/PycharmProjects/event_parser/src/utils/dicts.py[line:189] - INFO: [Info] jieba总共添加了10889个自定义词汇。 4 | 2019-04-29 16:19:32,879 - /Users/li/PycharmProjects/event_parser/src/utils/dicts.py[line:189] - INFO: [Info] jieba总共添加了10889个自定义词汇。 5 | 2019-04-29 16:20:55,599 - /Users/li/PycharmProjects/event_parser/src/utils/dicts.py[line:189] - INFO: [Info] jieba总共添加了10889个自定义词汇。 6 | -------------------------------------------------------------------------------- /src/parser/xueqiu/log/discuss_stock_filter_daily.log.2019-04-29: -------------------------------------------------------------------------------- 1 | 2019-04-29 16:09:35,871 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:62] - INFO: program start at 2019-04-28 00:00:00 2 | 2019-04-29 16:09:37,541 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:82] - INFO: load discuss data from mysql successful 3 | 2019-04-29 16:09:38,095 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:120] - INFO: length of result: 55 4 | 2019-04-29 16:09:38,095 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:126] - INFO: program finished 5 | 2019-04-29 16:09:52,444 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:61] - INFO: program start at 2019-04-28 09:00:00 6 | 2019-04-29 16:09:53,530 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:81] - INFO: load discuss data from mysql successful 7 | 2019-04-29 16:09:53,981 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:119] - INFO: length of result: 55 8 | 2019-04-29 16:09:53,981 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:125] - INFO: program finished 9 | 2019-04-29 16:18:39,087 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:61] - INFO: program start at 2019-04-28 09:00:00 10 | 2019-04-29 16:18:39,087 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:62] - INFO: program stop at 2019-04-29 09:00:00 11 | 2019-04-29 16:18:40,227 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:82] - INFO: load discuss data from mysql successful 12 | 2019-04-29 16:18:40,686 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:120] - INFO: length of result: 55 13 | 2019-04-29 16:18:40,686 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:126] - INFO: program finished 14 | 2019-04-29 16:19:34,182 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:61] - INFO: program start at 2019-04-28 00:00:00 15 | 2019-04-29 16:19:34,182 - 
/Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:62] - INFO: program stop at 2019-04-29 00:00:00 16 | 2019-04-29 16:19:35,319 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:82] - INFO: load discuss data from mysql successful 17 | 2019-04-29 16:19:35,775 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:120] - INFO: length of result: 55 18 | 2019-04-29 16:19:35,775 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:126] - INFO: program finished 19 | 2019-04-29 16:20:56,877 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:61] - INFO: program start at 2019-04-28 00:00:00 20 | 2019-04-29 16:20:56,877 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:62] - INFO: program stop at 2019-04-29 00:00:00 21 | 2019-04-29 16:20:57,998 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:82] - INFO: load discuss data from mysql successful 22 | 2019-04-29 16:20:58,450 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:121] - INFO: length of result: 55 23 | 2019-04-29 16:20:58,450 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:126] - INFO: 数据保存到第 2019-04-28 天 24 | 2019-04-29 16:20:58,450 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/discuss_parser/xueqiu_discuss_daily.py[line:128] - INFO: program finished 25 | -------------------------------------------------------------------------------- /src/parser/xueqiu/log/tokenization_log.log.2019-04-29: -------------------------------------------------------------------------------- 1 | 2019-04-29 16:09:34,509 - /Users/li/PycharmProjects/event_parser/src/utils/tokenization.py[line:37] - INFO: Stopwords 导入成功! 2 | 2019-04-29 16:09:51,173 - /Users/li/PycharmProjects/event_parser/src/utils/tokenization.py[line:37] - INFO: Stopwords 导入成功! 3 | 2019-04-29 16:18:37,726 - /Users/li/PycharmProjects/event_parser/src/utils/tokenization.py[line:37] - INFO: Stopwords 导入成功! 4 | 2019-04-29 16:19:32,881 - /Users/li/PycharmProjects/event_parser/src/utils/tokenization.py[line:37] - INFO: Stopwords 导入成功! 5 | 2019-04-29 16:20:55,600 - /Users/li/PycharmProjects/event_parser/src/utils/tokenization.py[line:37] - INFO: Stopwords 导入成功! 
6 | -------------------------------------------------------------------------------- /src/parser/xueqiu/log/xueqiu_focus_statistic.log.2019-04-29: -------------------------------------------------------------------------------- 1 | 2019-04-29 15:32:14,329 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 2 | 2019-04-29 15:32:37,232 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 3 | 2019-04-29 15:34:08,280 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 4 | 2019-04-29 15:34:32,826 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 5 | 2019-04-29 15:35:56,258 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 51292-06-25 00:00:00 6 | 2019-04-29 15:36:11,297 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 7 | 2019-04-29 15:38:13,175 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:36] - INFO: program start at 2019-04-29 00:00:00 8 | 2019-04-29 15:40:33,020 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 9 | 2019-04-29 15:42:01,647 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 10 | 2019-04-29 15:42:03,936 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 11 | 2019-04-29 15:42:03,937 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished 12 | 2019-04-29 15:42:50,969 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 13 | 2019-04-29 15:42:53,807 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 14 | 2019-04-29 15:42:53,807 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished, end at 2019-04-29 15 | 2019-04-29 15:43:43,864 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 16 | 2019-04-29 15:43:46,227 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 17 | 2019-04-29 15:43:46,227 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished, end at 2019-04-29 00:00:00 18 | 2019-04-29 15:44:10,233 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 19 | 2019-04-29 15:44:12,612 - 
/Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 20 | 2019-04-29 15:44:12,612 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished, end at 2019-04-29 21 | 2019-04-29 15:44:47,022 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 22 | 2019-04-29 15:44:49,805 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 23 | 2019-04-29 15:47:00,046 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 24 | 2019-04-29 15:47:02,921 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 25 | 2019-04-29 15:47:02,922 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished, end at 2019.04.29 26 | 2019-04-29 15:47:47,126 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 27 | 2019-04-29 15:47:49,928 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:65] - INFO: 数据保存到第 2019-04-28 天 28 | 2019-04-29 15:47:49,928 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: program finished, end at 2019-04-29 15:47:49 29 | 2019-04-29 15:50:14,033 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:38] - INFO: program start at 2019-04-29 00:00:00 30 | 2019-04-29 15:50:15,301 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:49] - INFO: load 20 data 31 | 2019-04-29 15:50:16,338 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:66] - INFO: 生成 20 条数据 32 | 2019-04-29 15:50:16,338 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:67] - INFO: 数据保存到第 2019-04-28 天 33 | 2019-04-29 15:50:16,339 - /Users/li/PycharmProjects/event_parser/src/parser/xueqiu/focus_parser/xueqiu_focus_statistics.py[line:68] - INFO: program finished, end at 2019-04-29 15:50:16 34 | -------------------------------------------------------------------------------- /src/singlepass_run.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
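The clustering below is delegated to singlePassCluster.OnePassCluster(vector_tuple=tfidf_train_tuple, threshold=10). As a rough mental model only -- the real implementation lives in src/algorithm/cluster/singlePass and its distance measure and threshold semantics may differ -- single-pass clustering of the (news_id, tf-idf vector) tuples looks like this:

    import numpy as np

    def single_pass(vector_tuples, threshold):
        clusters = []                          # each: {'center': vec, 'members': [news_ids]}
        for news_id, vec in vector_tuples:
            vec = np.asarray(vec)
            best, best_dist = None, float('inf')
            for c in clusters:
                dist = np.linalg.norm(vec - c['center'])
                if dist < best_dist:
                    best, best_dist = c, dist
            if best is not None and best_dist <= threshold:
                best['members'].append(news_id)    # join the nearest existing cluster
            else:
                clusters.append({'center': vec, 'members': [news_id]})
        return clusters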
6 | @author: li 7 | @file: singlepass_run.py 8 | @time: 2018/11/29 8:04 PM 9 | 新闻聚类 10 | """ 11 | import sys 12 | import time 13 | import pickle 14 | sys.path.append('..') 15 | sys.path.append('../') 16 | sys.path.append('../../') 17 | from src.configure import conf 18 | from src.utils.log import log_util 19 | from src.utils.VSM import tfidf 20 | from src.algorithm.cluster.singlePass import singlePassCluster 21 | 22 | logging = log_util.Logger('singlepass_run') 23 | # corpus_train_path = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt" 24 | corpus_train_path = conf.corpus_train_path 25 | # tfidf_train, word_dict = tfidf_vector(corpus_train) 26 | # tfidf_train, word_dict = tfidf.tfidf_vector(corpus_train) 27 | corpus_train_dict = tfidf.load_data(corpus_train_path) 28 | 29 | # load tf-idf VSM 30 | tfidf_feature_path = conf.tfidf_feature_path 31 | tfidf_transformer_path = conf.tfidftransformer_path 32 | 33 | try: 34 | tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path) 35 | logging.logger.info("TF-IDF feature load success") 36 | tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path) 37 | logging.logger.info("TF-IDF transformer load success") 38 | except: 39 | logging.logger.info("TF-IDF model load failed, please check path %s,%s" % (tfidf_feature_path, 40 | tfidf_transformer_path)) 41 | sys.exit() 42 | # 计算历史新闻文本的TF-IDF,并与news_id组成tuple 43 | tf_idf_start_time = time.time() 44 | tfidf_train_tuple = tfidf.load_batch_tfidf_vector(corpus_train_dict, tfidf_feature, tfidf_transformer) 45 | logging.logger.info('TF-IDF of news calculate success, using {} s'.format(time.time() - tf_idf_start_time)) 46 | 47 | # tfidf_train_tuple = [] 48 | # for item in corpus_train_dict.items(): 49 | # catagory, corpus = item[1], item[0] 50 | # tfidf_train_tuple.append((catagory, tfidf.load_tfidf_vectorizer([corpus], tfidf_feature, tfidf_transformer))) 51 | 52 | # tfidf_train_dict, tfidf_train_tuple, word_dict = tfidf.tfidf_vectorizer(corpus_train_path) 53 | 54 | # 对输入的历史新闻文本进行singlepass聚类。 55 | # clustering = OnePassCluster(vector_tuple=tfidf_train.toarray(), threshold=10) 56 | # clustering = singlePassCluster.OnePassCluster(vector_tuple=tfidf_train_tuple, threshold=10) 57 | statrt_time = time.time() 58 | clustering = singlePassCluster.OnePassCluster(vector_tuple=tfidf_train_tuple, threshold=10) 59 | clustering.print_result() 60 | logging.logger.info('singPass cluster done, it take\'s %s s' % (time.time()-statrt_time)) 61 | 62 | # 将聚好的类簇保存下来,为后面的事件表示和有效事件判断使用。 63 | # clustering_path = '/Users/li/PycharmProjects/event_parser/src/model/clustering_new.pkl' 64 | clustering_path = conf.clustering_save_path 65 | with open(clustering_path, 'wb') as fw: 66 | pickle.dump(clustering, fw) 67 | logging.logger.info("cluster units save success in path{}".format(clustering_path)) 68 | # for cluster_index, cluster in enumerate(cluster_list): 69 | # print "cluster:%s" % cluster_index # 簇的序号 70 | # print cluster.node_list # 该簇的节点列表 71 | -------------------------------------------------------------------------------- /src/singlepass_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
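The nonzero() prints at the bottom of this script only eyeball the sparsity pattern; a more direct agreement check between the persisted pipeline and a fresh refit could look like this (sketch; it assumes tfidf_train_tuple and tfidf_train_tuple2, defined below, are aligned on the same document):

    import numpy as np

    v_loaded = tfidf_train_tuple[0][1]    # vector from the persisted feature/transformer
    v_refit = tfidf_train_tuple2[0][1]    # vector from refitting on the corpus
    print(np.array_equal(np.nonzero(v_loaded)[0], np.nonzero(v_refit)[0]))   # same sparsity pattern?
    print(np.allclose(v_loaded, v_refit))                                    # same values?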
6 | @author: li 7 | @file: singlepass_test.py 8 | @time: 2018-12-27 20:38 9 | """ 10 | 11 | 12 | import sys 13 | import numpy as np 14 | sys.path.append('..') 15 | sys.path.append('../') 16 | sys.path.append('../../') 17 | from src.configure import conf 18 | from src.utils.log import log_util 19 | from src.utils.VSM import tfidf 20 | 21 | logging = log_util.Logger('singlepass_test') 22 | # corpus_train_path = "/Users/li/PycharmProjects/event_parser/src/data/text_full_index.txt" 23 | corpus_train_path = conf.corpus_train_path 24 | # tfidf_train, word_dict = tfidf_vector(corpus_train) 25 | # tfidf_train, word_dict = tfidf.tfidf_vector(corpus_train) 26 | corpus_train_dict = tfidf.load_data(corpus_train_path) 27 | 28 | # load tf-idf VSM 29 | tfidf_feature_path = conf.tfidf_feature_path 30 | tfidf_transformer_path = conf.tfidftransformer_path 31 | 32 | try: 33 | tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path) 34 | tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path) 35 | logging.logger.info("TF-IDF model load sucess") 36 | except: 37 | logging.logger.info("TF-IDF model load failed, please check path %s,%s" % (tfidf_feature_path, 38 | tfidf_transformer_path)) 39 | sys.exit() 40 | # 计算历史新闻文本的TF-IDF,并与news_id组成tuple 41 | tfidf_train_tuple = tfidf.load_batch_tfidf_vector(corpus_train_dict, tfidf_feature, tfidf_transformer) 42 | logging.logger.info('TF-IDF of news calculate success') 43 | 44 | # tfidf_train_tuple = [] 45 | # for item in corpus_train_dict.items(): 46 | # catagory, corpus = item[1], item[0] 47 | # tfidf_train_tuple.append((catagory, tfidf.load_tfidf_vectorizer([corpus], tfidf_feature, tfidf_transformer))) 48 | 49 | tfidf_train_dict, tfidf_train_tuple2, word_dict = tfidf.tfidf_vectorizer(corpus_train_path) 50 | 51 | # for i in tfidf_train_tuple[0][1]: 52 | # print i 53 | print(np.nonzero(tfidf_train_tuple[0][1])) 54 | print(np.nonzero(tfidf_train_tuple2[0][1])) 55 | 56 | print(tfidf_train_tuple[0][1] == np.nonzero(tfidf_train_tuple2[0][1])) 57 | 58 | # for i in dict(tfidf_train_tuple).items(): 59 | # print i[0] -------------------------------------------------------------------------------- /src/utils/Keywords.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
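The extractor below intersects HanLP keywords with jieba-TextRank keywords. A minimal standalone sketch of the same idea (assumes pyhanlp and jieba are installed; the str() cast is only a precaution in case HanLP hands back Java string proxies):

    from jieba import analyse
    from pyhanlp import HanLP

    doc = '水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露,根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标。'
    kw_jieba = set(analyse.textrank(doc, 20))
    kw_hanlp = set(str(w) for w in HanLP.extractKeyword(doc, 20))
    print(kw_jieba & kw_hanlp)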
6 | @author: li 7 | @file: Keywords.py 8 | @time: 2018/11/14 8:23 PM 9 | 关键词提取程序 10 | """ 11 | import sys 12 | sys.path.append("../") 13 | from src.utils.log import log_util 14 | from src.configure import Configure 15 | from pyhanlp import * 16 | from jieba import analyse 17 | from src.utils.tokenization import Tokenizer 18 | 19 | logging = log_util.Logger('keywordsLog') 20 | 21 | # logging.logger.info("running %s" % ' '.join(sys.argv)) 22 | 23 | 24 | class keywordsExtractor(object): 25 | def __init__(self): 26 | conf = Configure() 27 | self.stopwords_path = conf.stop_words_path 28 | self.percent = 0.1 29 | 30 | def run(self, document): 31 | 32 | # tk = Tokenizer() 33 | # document = tk.token(document) 34 | # 基于Hanlp库的关键词提取 35 | print("[Info] keywords by Hanlp:") 36 | keywords_hanlp = HanLP.extractKeyword(document, 20) 37 | # print(",".join(keyword for keyword in keywords_hanlp)) 38 | 39 | # 基于jieba库的关键词抽取 40 | # 添加停用词 41 | analyse.set_stop_words(self.stopwords_path) 42 | # 引入TextRank关键词抽取接口 43 | textrank = analyse.textrank 44 | print("[Info] keywords by textrank:") 45 | # keywords_jieba = textrank(document, 8, allowPOS=['n', 'nr', 'ns', 'vn', 'v']) 46 | # keywords_jieba = textrank(document, 20, withWeight=True) 47 | keywords_jieba = textrank(document, 20) 48 | # 输出抽取出的关键词 49 | # print(",".join(keyword for keyword in keywords_jieba)) 50 | 51 | # 两种关键词提取接口做交集 52 | print("[Info] 两个关键词提取方法取交集:") 53 | join_set = set(keywords_hanlp).intersection(set(keywords_jieba)) 54 | # print(",".join(item for item in join_set)) 55 | return join_set 56 | 57 | 58 | def test(): 59 | # 关键词提取 60 | document = u"水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \ 61 | u"根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标,\n" \ 62 | u"有部分省超过红线的指标。对一些超过红线的地方,\n陈明忠表示,对一些取用水项目进行区域的限批," \ 63 | u"严格地进行水资源论证和取水许可的批准。" 64 | 65 | kex = keywordsExtractor() 66 | keywords = kex.run(document) 67 | print(",".join(item for item in keywords)) 68 | 69 | 70 | if __name__ == '__main__': 71 | test() -------------------------------------------------------------------------------- /src/utils/VSM/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: 1.0 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019/12/12 1:58 下午 9 | """ -------------------------------------------------------------------------------- /src/utils/VSM/tfidf.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ??
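A typical round trip with the helpers in this module (sketch; the paths are the ones exposed by src.configure.conf and already used below):

    from src.configure import conf

    # fit on the training corpus and persist vocabulary + transformer
    _, tfidf_train_tuple, word_dict = tfidf_vectorizer(conf.corpus_train_path)

    # later, at prediction time: reload and transform only (never re-fit)
    feature = load_tfidf_feature(conf.tfidf_feature_path)
    transformer = load_tfidf_transformer(conf.tfidftransformer_path)
    vec = load_tfidf_vectorizer(['已 分词 的 文本'], feature, transformer)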
6 | @author: li 7 | @file: tfidf.py 8 | @time: 2018/11/28 4:03 PM 9 | """ 10 | import pickle 11 | import numpy as np 12 | 13 | from sklearn.feature_extraction.text import TfidfTransformer 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | 16 | import sys 17 | sys.path.append('..') 18 | sys.path.append('../') 19 | sys.path.append('../../') 20 | sys.path.append('/Users/li/PycharmProjects/event_parser/src') 21 | from src.configure import Configure 22 | 23 | conf = Configure() 24 | 25 | 26 | def load_data(corpus_path): 27 | corpus_train_dic = {} 28 | for line in open(corpus_path): 29 | line = line.strip().split('\t') 30 | if len(line) == 3: 31 | category = line[0] 32 | words = line[2] 33 | corpus_train_dic[category] = words 34 | return corpus_train_dic 35 | 36 | 37 | def tfidf_vectorizer(corpus_path): 38 | """vectorize the training documents""" 39 | corpus_train = [] 40 | category_train = [] 41 | for line in open(corpus_path): 42 | line = line.strip().split('\t') 43 | if len(line) == 3: 44 | category = line[0] 45 | words = line[2] 46 | category_train.append(category) 47 | corpus_train.append(words) 48 | print("build train-corpus done!!") 49 | print("corpus_train.shape %s" % np.shape(corpus_train)) 50 | # replace 必须加,保存训练集的特征 51 | count_vectorizer = CountVectorizer(decode_error="replace") 52 | # count_vectorizer = CountVectorizer(max_df=0.4, min_df=0.01, decode_error="replace") 53 | counts_train = count_vectorizer.fit_transform(corpus_train) 54 | 55 | word_dict = {} 56 | for index, word in enumerate(count_vectorizer.get_feature_names()): 57 | word_dict[index] = word 58 | print("The VSM shape of train is" + repr(counts_train.shape)) 59 | 60 | tfidftransformer = TfidfTransformer() 61 | # 注意在训练的时候必须用vectorizer.fit_transform、tfidftransformer.fit_transform 62 | # 在预测的时候必须用vectorizer.transform、tfidftransformer.transform 63 | tfidf_train = tfidftransformer.fit_transform(counts_train) 64 | # tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train) 65 | 66 | tfidf_train_array = tfidf_train.toarray() 67 | tfidf_train_dict = {} 68 | for item in range(len(tfidf_train_array)): 69 | tfidf_train_dict[category_train[item]] = tfidf_train_array[item] 70 | 71 | tfidf_train_tuple = [] 72 | for item in range(len(tfidf_train_array)): 73 | tfidf_train_tuple.append((category_train[item], tfidf_train_array[item])) 74 | 75 | # 保存经过fit的vectorizer 与 经过fit的tfidftransformer,预测时使用 76 | # tfidf_feature_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/feature_1.pkl' 77 | tfidf_feature_path = conf.tfidf_feature_path 78 | with open(tfidf_feature_path, 'wb') as fw: 79 | pickle.dump(count_vectorizer.vocabulary_, fw) 80 | 81 | # tfidftransformer_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/tfidftransformer_1.pkl' 82 | tfidftransformer_path = conf.tfidftransformer_path 83 | with open(tfidftransformer_path, 'wb') as fw: 84 | pickle.dump(tfidftransformer, fw) 85 | 86 | # word_dict_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/word_dict_1.pkl' 87 | word_dict_path = conf.word_dict_path 88 | with open(word_dict_path, 'wb') as fw: 89 | pickle.dump(word_dict, fw) 90 | 91 | return tfidf_train_dict, tfidf_train_tuple, word_dict 92 | 93 | 94 | def load_batch_tfidf_vector(corpus_train_dict, tfidf_feature, tfidf_transformer): 95 | """ 96 | 将语料库中的数据批量转换成VSM 97 | :param corpus_train_dict: 98 | :param tfidf_feature: 99 | :param tfidf_transformer: 100 | :return: 101 | """ 102 | tfidf_vsm_tuple = [] 103 | for item in corpus_train_dict.items(): 104 | 
category, corpus = item[0], item[1] 105 | tfidf_vsm_tuple.append((category, load_tfidf_vectorizer([corpus], tfidf_feature, tfidf_transformer))) 106 | 107 | return tfidf_vsm_tuple 108 | 109 | 110 | def load_tfidf_feature(tfidf_feature_path): 111 | """ 112 | load tf-idf VSM 113 | :param tfidf_feature_path: 114 | :return: 115 | """ 116 | tfidf_feature = pickle.load(open(tfidf_feature_path, "rb")) 117 | return tfidf_feature 118 | 119 | 120 | def load_tfidf_transformer(tfidf_transformer_path): 121 | """ 122 | load tf-idf transformer 123 | :param tfidf_transformer_path: 124 | :return: 125 | """ 126 | tfidf_transformer = pickle.load(open(tfidf_transformer_path, "rb")) 127 | return tfidf_transformer 128 | 129 | 130 | def load_tfidf_vectorizer(corpus_path, tfidf_feature, tfidf_transformer): 131 | """ 132 | :param tfidf_transformer: 133 | :param tfidf_feature: tf-idf feature 134 | :param corpus_path: 135 | :return: 136 | """ 137 | # if type(corpus_path) is not list: 138 | # corpus_test = [] 139 | # target_test = [] 140 | # for line in open(corpus_path): 141 | # line = line.strip().split('\t') 142 | # if len(line) == 3: 143 | # category = line[0] 144 | # words = line[2] 145 | # target_test.append(category) 146 | # corpus_test.append(words) 147 | # else: 148 | # corpus_test = corpus_path 149 | corpus_test = corpus_path 150 | # 加载特征 151 | loaded_vec = CountVectorizer(decode_error="replace", vocabulary=tfidf_feature) 152 | # 加载TfidfTransformer 153 | # 测试用transform,表示测试数据,为list 154 | test_tfidf = tfidf_transformer.transform(loaded_vec.transform(corpus_test)) 155 | return test_tfidf.toarray().reshape(-1) 156 | 157 | 158 | def tfidf_vector_test(corpus_path): 159 | """vectorize the input documents""" 160 | corpus_train = [] 161 | # 利用train-corpus提取特征 162 | target_train = [] 163 | for line in open(corpus_path): 164 | line = line.strip().split('\t') 165 | if len(line) == 3: 166 | words = line[2] 167 | category = line[0] 168 | target_train.append(category) 169 | corpus_train.append(words) 170 | print("build train-corpus done!!") 171 | count_v1 = CountVectorizer(max_df=0.4, min_df=0.01) 172 | # count_v1 = CountVectorizer() 173 | counts_train = count_v1.fit_transform(corpus_train) 174 | 175 | word_dict = {} 176 | for index, word in enumerate(count_v1.get_feature_names()): 177 | word_dict[index] = word 178 | 179 | print("the shape of train is " + repr(counts_train.shape)) 180 | tfidftransformer = TfidfTransformer() 181 | tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train) 182 | return tfidf_train, word_dict 183 | 184 | 185 | if __name__ == '__main__': 186 | # corpus_train = "/Users/li/PycharmProjects/event_parser/src/text_full_full.txt" 187 | corpus_train = conf.corpus_train_path 188 | tfidf_train_dic, tfidf_train_tuple, word_dict = tfidf_vectorizer(corpus_train) 189 | print(np.nonzero(tfidf_train_dic["111755669"])) 190 | print(np.shape(tfidf_train_dic['111755669'])) 191 | print(type(tfidf_train_dic['111755669'])) 192 | # print np.shape(tfidf_train.toarray()[0]) 193 | # print np.nonzero(tfidf_train.toarray()[0]) 194 | # for i in tfidf_train.toarray()[0]: 195 | # print i 196 | 197 | # corpus_data_dic = load_data(corpus_train) 198 | # print type(corpus_data_dic['111755669']) 199 | # tfidf_test = load_tfidf_vectorizer([corpus_data_dic['111755669']]).toarray().reshape(-1) 200 | # print np.nonzero(tfidf_test) 201 | -------------------------------------------------------------------------------- /src/utils/VSM/vector.py: 
-------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: vector.py 8 | @time: 2018/11/5 2:31 PM 9 | 词向量,文本向量训练模块 10 | 训练用的编码格式要与使用model时的编码格式一致。 11 | """ 12 | 13 | import sys 14 | import os 15 | sys.path.append("..") 16 | sys.path.append("../") 17 | sys.path.append("../../") 18 | import logging.handlers 19 | from src.utils import file_util 20 | import numpy as np 21 | import multiprocessing 22 | from gensim.models.word2vec import Word2Vec 23 | from gensim.models.doc2vec import Doc2Vec, LabeledSentence 24 | from src import data_reader 25 | 26 | LOG_FILE = '../log/vectors.log' 27 | file_util.check_path(LOG_FILE) 28 | handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=1) # 实例化handler 29 | fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(levelname)s - %(message)s' 30 | # logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 31 | formatter = logging.Formatter(fmt) 32 | handler.setFormatter(formatter) 33 | logger = logging.getLogger() 34 | logger.addHandler(handler) 35 | logger.setLevel(level=logging.INFO) 36 | # logger.setLevel(level=logging.DEBUG) 37 | logger.info("running %s" % ' '.join(sys.argv)) 38 | 39 | 40 | class word2vecs(object): 41 | """ 42 | :keyword Word2Vec模型训练 43 | """ 44 | def __init__(self, wd_configure): 45 | """ 46 | Word2Vec模型训练 47 | :param wd_configure: 模型的参数 48 | """ 49 | 50 | if "size" in wd_configure.keys(): 51 | # 训练时词向量维度,默认为100 52 | self.size = wd_configure["size"] 53 | else: 54 | self.size = 300 55 | 56 | if "min_count" in wd_configure.keys(): 57 | # min_count 不能设置过大,不然词汇表中会没有词汇 58 | # 需要训练词语的最小出现次数,默认为5 59 | self.min_count = wd_configure["min_count"] 60 | else: 61 | self.min_count = 1 62 | 63 | if "window" in wd_configure.keys(): 64 | self.window = wd_configure["window"] 65 | else: 66 | self.window = 5 67 | 68 | if "worker" in wd_configure.keys(): 69 | # 完成训练过程的线程数,默认为1不使用多线程, 只有注意安装Cython的前提下该参数设置才有意义 70 | self.worker = wd_configure["worker"] 71 | else: 72 | self.worker = multiprocessing.cpu_count() 73 | 74 | def train(self, sentences): 75 | """ 76 | 模型训练 77 | :param sentences:每行为一个list 如sentences = [['A1', 'A2'], ['A1', 'A2'], ['A1', 'A2', 'A1', 'A2']] 78 | :return: word2vec 模型 79 | # """ 80 | model_w2d = Word2Vec(size=self.size, window=self.window, min_count=self.min_count, workers=self.worker) 81 | model_w2d.build_vocab(sentences) 82 | model_w2d.train(sentences, total_examples=model_w2d.corpus_count, epochs=model_w2d.iter) 83 | return model_w2d 84 | 85 | def save(self, model, model_path): 86 | model.save(model_path) 87 | 88 | def load_model(self, model_path): 89 | return Word2Vec.load(model_path) 90 | 91 | 92 | class doc2vec(object): 93 | def __init__(self, dm_configure): 94 | if dm_configure.min_count: 95 | self.min_count = dm_configure.min_count 96 | else: 97 | self.min_count = 1 98 | 99 | if dm_configure.window: 100 | self.window = dm_configure.window 101 | else: 102 | self.window = 3 103 | 104 | if dm_configure.size: 105 | self.size = dm_configure.size 106 | else: 107 | self.size = 200 108 | 109 | if dm_configure.sample: 110 | self.sample = dm_configure.sample 111 | else: 112 | self.sample = 1e-3 113 | 114 | if dm_configure.negative: 115 | self.negative = dm_configure.negative 116 | else: 117 | self.negative = 5 118 | 119 | if dm_configure.workers: 120 | self.workers = dm_configure.workers 121 | else: 122 | self.workers = multiprocessing.cpu_count() 123 | 124 | def 
train(self, x_train): 125 | model_dm = Doc2Vec(x_train, min_count=self.min_count, window=self.window, size=self.size, 126 | sample=self.sample, negative=self.negative, workers=self.workers) 127 | model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=100) 128 | 129 | return model_dm 130 | 131 | def save(self, model_dm, model_path): 132 | # model_dm.save('../model/model_dm') 133 | model_dm.save(model_path) # model_dm.load(model_path) 134 | # model_dm.save_word2vec_format(model_path) # model_dm.load_word2vec_format(model_path,encoding='utf-8') 135 | 136 | # def train(x_train, size=200, epoch_num=1): 137 | # model_dm = Doc2Vec(x_train, min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4) 138 | # model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=100) 139 | # model_dm.save('../model/model_dm') 140 | # 141 | # return model_dm 142 | 143 | 144 | def word2vec_train(self): 145 | # load training data 146 | x_train = data_reader.get_news_data() 147 | # word2vec 训练测试 148 | wd_configure = {"size": 300, 149 | "window": 2, 150 | "min_count": 1} 151 | wd = word2vecs(wd_configure) 152 | model_wd = wd.train(x_train) 153 | print("[Info] word2vec模型训练结束") 154 | logger.info("[Info] word2vec模型训练结束") 155 | print(model_wd.wv[u'食品饮料']) 156 | # print model_wd.most_similar['食品饮料'] 157 | 158 | # 模型保存 159 | model_path = "/Users/li/PycharmProjects/event_parser/src/model/model_300_2_1" 160 | if not os.path.exists(model_path): 161 | wd.save(model_wd, model_path) 162 | else: 163 | print("[Exception] word2vec的保存路径已经存在。") 164 | 165 | 166 | def word2vec_load(model_path=None): 167 | """ 168 | load word2vec model 169 | :return: 170 | """ 171 | if model_path: 172 | model_path = model_path 173 | else: 174 | model_path = "/Users/li/PycharmProjects/event_parser/src/model/model_300_2_1" 175 | 176 | wd_conf = {"size": 300, 177 | "window": 5, 178 | "min_count": 1} 179 | model_wd = word2vecs(wd_conf) 180 | model_wd = model_wd.load_model(model_path) 181 | # print model_wd.wv[u'食饮料'] 182 | return model_wd 183 | 184 | 185 | def word_vector(word, w2v_model): 186 | """ 187 | 查找某个词的词向量 188 | :param word: 需要查找的词 189 | :param w2v_model: 词向量 shape = (vector_size, ) 190 | :return: 191 | """ 192 | try: 193 | vector = w2v_model.wv[word] 194 | return vector 195 | except KeyError: 196 | return np.zeros(w2v_model.vector_size) 197 | 198 | 199 | if __name__ == '__main__': 200 | model_path = "/Users/li/PycharmProjects/event_parser/src/model/model_300_2_1" 201 | 202 | # word2vec 训练测试 203 | wd_conf = {"size": 300, 204 | "window": 5, 205 | "min_count": 1} 206 | x_train = data_reader.get_data_sets() 207 | wd = word2vecs(wd_conf) 208 | model_wd = wd.train(x_train) 209 | print(model_wd.wv[u'球员']) 210 | 211 | # doc2vec 训练测试 212 | # cluster_centers = cluster(x_train) 213 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2018/10/30 10:45 AM 9 | """ 10 | import sys 11 | sys.path.append('../') 12 | sys.path.append('../../') 13 | 14 | -------------------------------------------------------------------------------- /src/utils/cluster_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
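cluster() below infers one Doc2Vec vector per document and hands the vectors to KMeans. The sklearn part in isolation, on toy data (illustrative only):

    import numpy as np
    from sklearn.cluster import KMeans

    X = np.random.rand(200, 50)        # stand-in for the infer_vector() outputs
    km = KMeans(n_clusters=15).fit(X)
    print(km.predict(X[:5]))           # cluster id per document
    print(km.cluster_centers_.shape)   # (15, 50)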
6 | @author: li 7 | @file: cluster_test.py 8 | @time: 2018/11/6 1:35 PM 9 | 聚类模块 10 | """ 11 | from sklearn.cluster import KMeans 12 | from gensim.models.doc2vec import Doc2Vec, LabeledSentence 13 | 14 | 15 | def cluster(x_train): 16 | infered_vectors_list = [] 17 | print("load doc2vec model...") 18 | model_dm = Doc2Vec.load("model/model_dm") 19 | print("load train vectors...") 20 | i = 0 21 | for text, label in x_train: 22 | vector = model_dm.infer_vector(text) 23 | infered_vectors_list.append(vector) 24 | i += 1 25 | 26 | print("train kmean model...") 27 | kmean_model = KMeans(n_clusters=15) 28 | kmean_model.fit(infered_vectors_list) 29 | labels = kmean_model.predict(infered_vectors_list[0:100]) 30 | cluster_centers = kmean_model.cluster_centers_ 31 | 32 | with open("out/own_claasify.txt", 'w') as wf: 33 | for i in range(100): 34 | string = "" 35 | text = x_train[i][0] 36 | for word in text: 37 | string = string + word 38 | string = string + '\t' 39 | string = string + str(labels[i]) 40 | string = string + '\n' 41 | wf.write(string) 42 | 43 | return cluster_centers 44 | -------------------------------------------------------------------------------- /src/utils/corpus_update.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: corpus_update.py 8 | @time: 2018-12-19 16:51 9 | """ 10 | import sys 11 | sys.path.append('../') 12 | sys.path.append('../../') 13 | sys.path.append('../../../') 14 | from src.utils import dicts 15 | import codecs # noqa: E402 16 | import pandas as pd # noqa: E402 17 | from src import data_reader # noqa: E402 18 | from src.configure import conf # noqa: E402 19 | 20 | 21 | def stock_code_data_process(): 22 | dic_path = conf.dic_path 23 | stock_new_path = dic_path + "/stock.csv" 24 | n2_path = dic_path + "/新增2" 25 | # 处理股票实体 26 | # 将stock_words.txt中的股票词转换成jieba用户自定义词典的格式,然后添加到jieba的userdict中 27 | # 读取股票代码 28 | stock_df = data_reader.read_stock_code('TQ_SK_BASICINFO') 29 | stock_df['SYMBOL'] = stock_df['SYMBOL'].apply(lambda x: "'" + x + "'") 30 | 31 | stock_dict = [] 32 | for s in stock_df.values: 33 | code, stock = s[0], s[1] 34 | stock_dict.append(code + ' ' + '5' + ' ' + 'n') 35 | stock_dict.append(stock.strip('\n').decode('utf-8') + ' ' + '5' + ' ' + 'n') 36 | f = codecs.open(n2_path, 'w', 'utf-8') 37 | for i in stock_dict: 38 | f.write(i + '\n') # \n为换行符 39 | f.close() 40 | # 数据保存 41 | stock_df.to_csv(path_or_buf=stock_new_path, index=False) 42 | 43 | 44 | if __name__ == '__main__': 45 | stock_code_data_process() 46 | dic_path = conf.dic_path 47 | stock_new_path = dic_path + "/stock.csv" 48 | data_df = pd.read_csv(stock_new_path, encoding="utf-8").set_index('SESNAME') 49 | print(data_df.loc[u'万科A'].values) 50 | # dicts.init() 51 | # print dicts.stock_dict 52 | # for index, row in data_df.iterrows(): 53 | # print row.SESNAME 54 | -------------------------------------------------------------------------------- /src/utils/data_process.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
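Typical call pattern for the stock-entity lookup defined below (sketch; elsewhere in the project the stock table is passed in indexed by SESNAME, so the same is assumed here):

    dp = DataPressing()
    tokens = ['中信证券', '上涨', '注册制']                      # output of Tokenizer.token()
    stocks = dp.find_stocks(tokens, data_df.set_index('SESNAME'))
    print(stocks)                                               # matching stock rows/codes, or []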
6 | @author: li 7 | @file: data_process.py 8 | @time: 2018/10/31 1:32 PM 9 | 金融文本解析类 10 | """ 11 | import sys 12 | sys.path.append('../') 13 | sys.path.append('../../') 14 | import re 15 | import pandas as pd 16 | from src.configure import conf 17 | 18 | dic_path = conf.dic_path 19 | stock_new_path = dic_path + "/stock.csv" 20 | data_df = pd.read_csv(stock_new_path, encoding="utf-8") 21 | 22 | 23 | class DataPressing(object): 24 | def __init__(self): 25 | # 杂质词 26 | self.pattern_word = u'(\\[AI\u51b3\u7b56\\])|(\u3010\u4eca\u65e5\u9898\u6750\u3011)' \ 27 | u'|(\u5173\u6ce8\u540c.*\u673a\u4f1a\u3002)' 28 | # [关注同花顺财经(ths518),获取更多机会。] 29 | self.pattern_text = u'(\\[AI\u51b3\u7b56\\])' 30 | self.num = 5 31 | 32 | def no_remove(self, text): 33 | """ 34 | 杂质词剔除,比如["今日走势", "AI决策"] 35 | :param text: 36 | :return: 37 | """ 38 | # result = re.sub(self.pattern_word, "", text.decode('utf8')) 39 | result = re.sub(self.pattern_word, "", text) 40 | return result 41 | 42 | def useless_contain(self, content): 43 | """ 44 | 判断content中是否包含某些字符 45 | :return: 46 | """ 47 | # py2使用 48 | # match_obj = re.search(self.pattern_text, content.decode('utf8')) 49 | match_obj = re.search(self.pattern_text, content) 50 | if match_obj: 51 | return True 52 | else: 53 | return False 54 | 55 | def useless_filter(self, content_list, stock_dicts): 56 | """ 57 | 如果文章中超过5只以上的股票,股市收报类的新闻,则将这篇文章剔除 58 | :param content_list: 分词之后的文章list 59 | :param stock_dicts: 股票代码 60 | :return: 61 | """ 62 | stock_num = 0 63 | for item in set(content_list): 64 | if item in stock_dicts: 65 | stock_num += 1 66 | 67 | if stock_num >= self.num: 68 | return True 69 | else: 70 | return False 71 | 72 | def find_stocks(self, content_list, stock_df): 73 | """ 74 | 提取content_list中所有的股票以及股票代码 75 | :param content_list: 分词之后的文章list 76 | :param stock_df: dataFrame 股票代码 77 | :return: 返回股票列表 78 | """ 79 | stock_num = [] 80 | for item in set(content_list): 81 | stock = [] 82 | # py2 使用 83 | # item = item.decode('utf-8') 84 | if item in stock_df.index.tolist(): 85 | res = stock_df.loc[item].values.tolist() 86 | if len(res) > 1: 87 | for i in range(len(res)): 88 | stock.extend(res[i]) 89 | else: 90 | stock.extend(res) 91 | stock_num.extend(stock) 92 | if len(stock_num) > 0: 93 | return stock_num 94 | else: 95 | return [] 96 | 97 | def find_keywords(self, content, key1, key2): 98 | """ 99 | 获取一大段文本之间两个关键字之间的内容 100 | :param content: 101 | :param key1: 102 | :param key2: 103 | :return: 104 | """ 105 | form = re.compile(key1 + '(.*?)' + key2, re.S) 106 | result = form.findall(content) 107 | return result 108 | -------------------------------------------------------------------------------- /src/utils/dicts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: dicts.py 8 | @time: 2018/11/5 2:31 PM 9 | jieba 字典初始化模块 10 | 功能:添加用户自定义词典,结巴添加新词 11 | 如果有新登陆词,可以在corpus中的新增中添加 12 | """ 13 | import jieba 14 | import pandas as pd 15 | from src.utils.log import log_util 16 | 17 | import sys 18 | sys.path.append('../') 19 | sys.path.append('../../') 20 | sys.path.append('../../../') 21 | try: 22 | from src.configure import conf 23 | except Exception: 24 | raise 25 | 26 | deg_dict = {} # 程度副词 27 | senti_dict = {} # 情感词 28 | eng_dict = {} # 英语或拼音词 29 | fou_dict = [] # 否定词 30 | but_dict = [] # 转折词 31 | lim_dict = [] # 限定词 32 | new_dict = [] # 新词 33 | zhi_dict = [] # 知网 34 | stock_dict = [] # 股票词 35 | stock_code_dict = [] # 股票代码 36 | jg_dict = [] # 机构名 37 | stock_df = [] 38 | 39 | logging = log_util.Logger('dict_log') 40 | 41 | 42 | class DictInit(object): 43 | pass 44 | 45 | 46 | def load_stock_data(): 47 | dic_path = conf.dic_path 48 | st_path = dic_path + "/stock_words.txt" 49 | st_new_path = dic_path + "/stock.csv" 50 | for st in open(st_path): 51 | # st = st.decode("utf8") 52 | code1, st_code = st.split("\t") 53 | code, stock = st_code.split(",") 54 | stock_code_dict.append(code.strip("\n")) 55 | stock_dict.append(stock.strip("\n")) 56 | 57 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 58 | # stock_df.append(stocks_df.set_index('SESNAME')) 59 | for index, row in stocks_df.iterrows(): 60 | stock_dict.append(row.SESNAME) 61 | stock_dict.append(row.SYMBOL) 62 | # 整理股票代码 63 | stocks_df = stocks_df.set_index('SESNAME') 64 | return stock_dict, stocks_df 65 | 66 | 67 | def init(): 68 | # dic_path = '/Users/li/PycharmProjects/huihongcaihui/src/corpus' 69 | dic_path = conf.dic_path 70 | 71 | # 读取词典 72 | d_path = dic_path + "/程度副词_datatang.txt" 73 | s_path = dic_path + "/senti.txt" 74 | f_path = dic_path + "/fou.txt" 75 | b_path = dic_path + "/but.txt" 76 | e_path = dic_path + "/eng.txt" 77 | l_path = dic_path + "/limit.dict" 78 | a_path = dic_path + "/dic.txt" 79 | ns_path = dic_path + "/新增_stock" 80 | n_path = dic_path + "/新增" 81 | n2_path = dic_path + "/新增2" 82 | st_path = dic_path + "/stock_words.txt" 83 | st_new_path = dic_path + "/stock.csv" 84 | zhi_ne_path = dic_path + "/知网/zhi_neg.txt" 85 | zhi_po_path = dic_path + "/知网/zhi_pos.txt" 86 | jg_path = dic_path + "/机构" 87 | 88 | # 添加基金公司实体名字,比如("工银瑞信基金", "华泰柏瑞基金", "东方基金") 89 | 90 | # 结巴新词 91 | word_add = set() 92 | 93 | for d in open(d_path): 94 | # temp = d.decode("utf-8").split(" ") 95 | temp = d.split(" ") 96 | word_arr = temp[1].strip("\n").rstrip(" ").split("、") 97 | for w in word_arr: 98 | deg_dict[w] = float(temp[0]) 99 | word_add.add(temp[0]) 100 | 101 | for s in open(s_path): 102 | # temp = s.decode("utf-8").split(" ") 103 | temp = s.split(" ") 104 | senti_dict[temp[0]] = float(temp[1]) 105 | word_add.add(temp[0]) 106 | 107 | for e in open(e_path): 108 | temp = e.split(" ") 109 | eng_dict[temp[0]] = float(temp[1]) 110 | word_add.add(temp[0]) 111 | 112 | for f in open(f_path): 113 | # f = f.decode("utf-8-sig") 114 | fou_dict.append(f.strip("\n")) 115 | word_add.add(f.strip("\n")) 116 | 117 | for b in open(b_path): 118 | but_dict.append(b.strip("\n")) 119 | word_add.add(b.strip("\n")) 120 | 121 | for l in open(l_path): 122 | lim_dict.append(l.strip("\n")) 123 | word_add.add(l.strip("\n")) 124 | 125 | for a in open(a_path): 126 | new_dict.append(a.strip("\n")) 127 | word_add.add(a.strip("\n")) 128 | 129 | for st in open(st_path): 130 | # st = st.decode("utf8") 131 | code1, st_code = st.split("\t") 132 | code, stock = st_code.split(",") 133 | 
stock_code_dict.append(code.strip("\n")) 134 | stock_dict.append(stock.strip("\n")) 135 | word_add.add(code.strip("\n")) 136 | word_add.add(stock.strip("\n")) 137 | stocks_df = pd.read_csv(st_new_path, encoding='utf-8') 138 | stock_df.append(stocks_df.set_index('SESNAME')) 139 | for index, row in stocks_df.iterrows(): 140 | stock_dict.append(row.SESNAME) 141 | 142 | for z1 in open(zhi_ne_path): 143 | # z1 = z1.decode("utf8") 144 | new_dict.append(z1.strip("\n")) 145 | word_add.add(z1.strip("\n")) 146 | 147 | for z2 in open(zhi_po_path): 148 | # z2 = z2.decode("utf8") 149 | z2_data = z2.strip("\n") 150 | new_dict.append(z2_data) 151 | word_add.add(z2_data) 152 | 153 | for jg in open(jg_path): 154 | # jg = jg.decode("utf8") 155 | jg_data = jg.split("\t")[0].strip("\n") 156 | new_dict.append(jg_data) 157 | word_add.add(jg_data) 158 | 159 | ''' 160 | # 将stock_words.txt中的股票词转换成jieba用户自定义词典的格式,然后添加到jieba的userdict中 161 | for st in open(st_path): 162 | code1, st_code = st.split("\t") 163 | code, stock = st_code.split(",") 164 | stock_dict.append(code + ' ' + '5' + ' ' + 'n') 165 | stock_dict.append(stock.strip('\n').decode('utf-8') + ' ' + '5' + ' ' + 'n') 166 | apply_func = codecs.open(n_path, 'w', 'utf-8') 167 | for i in stock_dict: 168 | apply_func.write(i + '\n') # \n为换行符 169 | apply_func.close() 170 | ''' 171 | # 添加用户自定义字典 172 | jieba.load_userdict(ns_path) 173 | jieba.load_userdict(n_path) 174 | jieba.load_userdict(jg_path) 175 | jieba.load_userdict(n2_path) 176 | 177 | # 添加新词 178 | for w in word_add: 179 | jieba.add_word(w) 180 | 181 | # 结巴添加新词 182 | jieba.add_word("淡定") 183 | # jieba.add_word("加多宝") 184 | # jieba.add_word("红罐") 185 | jieba.add_word("非公开") 186 | jieba.add_word("不成人形") 187 | jieba.add_word("中美贸易战") 188 | logging.logger.info("[Info] jieba总共添加了{}个自定义词汇。".format(len(word_add))) 189 | 190 | 191 | if __name__ == '__main__': 192 | init() 193 | -------------------------------------------------------------------------------- /src/utils/engine/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: 1.0 6 | @author: li 7 | @file: __init__.py.py 8 | @time: 2019/12/12 2:15 下午 9 | """ -------------------------------------------------------------------------------- /src/utils/engine/data_source.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li
7 | @file: data_source.py
8 | @time: 2018/10/30 6:39 PM
9 | """
10 | 
11 | 
12 | URL = 'url'
13 | DTYPE = 'DTYPE'
14 | OBJ = 'OBJ'
15 | 
16 | SQLALCHEMY = 1
17 | 
18 | __DNS = {
19 |     'DNDS': {
20 |         URL: 'mssql+pymssql://reader:reader@10.15.97.127:1433/dnds',
21 |         DTYPE: SQLALCHEMY
22 |     },
23 | 
24 |     'XAVIER': {
25 |         URL: 'mysql+mysqlconnector://root:t2R7P7@10.15.5.86:3306/xavier',
26 |         DTYPE: SQLALCHEMY
27 |     },
28 | 
29 |     'VISIONTEST': {
30 |         URL: 'mysql+mysqlconnector://root:1234@10.15.97.128:3306/test',
31 |         DTYPE: SQLALCHEMY
32 |     },
33 | 
34 |     'VISION': {
35 |         URL: 'mysql+mysqlconnector://root:1234@10.15.97.128:3306/vision',
36 |         DTYPE: SQLALCHEMY
37 |     },
38 | 
39 |     'XAVIER_DB': {
40 |         URL: 'mysql+mysqlconnector://root:t2R7P7@10.15.5.86:3306/xavier_db',
41 |         DTYPE: SQLALCHEMY
42 |     },
43 | 
44 |     'XAVIER_SQLITE': {
45 |         # URL: 'sqlite://///Users/li/workshop/dataset/database/xueqiu/discuss.db',
46 |         URL: 'sqlite://///Users/li/PycharmProjects/event_parser/src/parser/discuss_parser/discuss_data/discuss.db',
47 |         DTYPE: SQLALCHEMY
48 |     }
49 | 
50 | }
51 | 
52 | 
53 | def __getSqlAlchemyEngine(source):
54 |     if OBJ not in __DNS[source]:
55 |         import sqlalchemy as sa
56 |         __DNS[source][OBJ] = sa.create_engine(__DNS[source][URL])
57 |     return __DNS[source][OBJ]
58 | 
59 | 
60 | def GetDataEngine(source):
61 |     engine = None
62 |     if source in __DNS.keys():
63 |         if __DNS[source][DTYPE] == SQLALCHEMY:
64 |             return __getSqlAlchemyEngine(source)
65 |     else:
66 |         raise Exception("未知的数据源 --'{0}'".format(source))
67 |     return engine
68 | 
-------------------------------------------------------------------------------- /src/utils/engine/mysql_util.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li
7 | @file: mysql_util.py
8 | @time: 2018/11/28 4:44 PM
9 | """
10 | 
11 | import MySQLdb
12 | import z  # 项目的 MySQL 连接配置(mysql_ip / mysql_user / mysql_passwd / mysql_db / mysql_port)
13 | 
14 | 
15 | class mysql(object):
16 |     def __init__(self):
17 |         self.conn = MySQLdb.connect(host=z.mysql_ip,
18 |                                     user=z.mysql_user,
19 |                                     passwd=z.mysql_passwd,
20 |                                     db=z.mysql_db,
21 |                                     port=z.mysql_port)
22 |         self.cursor = self.conn.cursor()
23 | 
24 |     def close(self):
25 |         self.cursor.close()
26 |         self.conn.close()
27 | 
28 | 
29 | def load_from_mysql(query_str):
30 |     # 每次调用新建连接,避免复用已被关闭的全局连接
31 |     ms = mysql()
32 |     ms.cursor.execute(query_str)
33 |     result = ms.cursor.fetchall()
34 |     ms.close()
35 |     return result
36 | 
37 | 
38 | def insert_data(query_insert):
39 |     # SQL 插入语句
40 |     # sql = """INSERT INTO EMPLOYEE(FIRST_NAME,
41 |     #          LAST_NAME, AGE, SEX, INCOME)
42 |     #          VALUES ('Mac', 'Mohan', 20, 'M', 2000)"""
43 |     ms = mysql()
44 |     try:
45 |         # 执行sql语句
46 |         ms.cursor.execute(query_insert)
47 |         # 提交到数据库执行
48 |         ms.conn.commit()
49 |     except Exception:
50 |         # Rollback in case there is any error
51 |         ms.conn.rollback()
52 | 
53 |     # 关闭数据库连接
54 |     ms.close()
55 | 
56 | 
57 | def delete_data(query_delete):
58 |     # SQL 删除语句
59 |     # sql = "DELETE FROM EMPLOYEE WHERE AGE > %s" % (20)
60 |     ms = mysql()
61 |     try:
62 |         # 执行SQL语句
63 |         ms.cursor.execute(query_delete)
64 |         # 提交修改
65 |         ms.conn.commit()
66 |     except Exception:
67 |         # 发生错误时回滚
68 |         ms.conn.rollback()
69 | 
70 |     # 关闭连接
71 |     ms.close()
72 | 
73 | 
74 | def update_data(query_update):
75 |     # SQL 更新语句
76 |     # sql = "UPDATE EMPLOYEE SET AGE = AGE + 1 WHERE SEX = '%s'" % ('M')
77 |     ms = mysql()
78 |     try:
79 |         # 执行SQL语句
80 |         ms.cursor.execute(query_update)
81 |         # 提交修改
82 |         ms.conn.commit()
83 |     except Exception:
84 |         # 发生错误时回滚
85 |         ms.conn.rollback()
86 | 
87 |     # 关闭连接
88 |     ms.close()
-------------------------------------------------------------------------------- /src/utils/file_util.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li
7 | @file: file_util.py
8 | @time: 2018/11/19 8:52 AM
9 | """
10 | import os
11 | import datetime
12 | 
13 | 
14 | def check_path(_path):
15 |     """check whether the _path exists.
If not, create a new _path dit""" 16 | dir_name = os.path.dirname(_path) 17 | if dir_name: 18 | if not os.path.exists(dir_name): 19 | os.makedirs(dir_name) 20 | 21 | 22 | def list_all_files(file_path): 23 | _file = [] 24 | lists = os.listdir(file_path) 25 | for i in range(len(lists)): 26 | path = os.path.join(file_path, lists[i]) 27 | if os.path.isdir(path): 28 | _file.append(list_all_files(path)) 29 | if os.path.isfile(path): 30 | _file.append(path) 31 | 32 | return _file 33 | 34 | 35 | def find_newest_file(save_path): 36 | """ 37 | 从文件夹中读取最新保存或修改的文件。 38 | :param save_path: 目录地址 39 | :return: 40 | """ 41 | _file = [] 42 | lists = os.listdir(save_path) # 列出目录的下所有文件和文件夹保存到lists 43 | if len(lists) > 0: 44 | for i in range(len(lists)): 45 | path = lists[i] 46 | # 提取文件,剔除文件夹 47 | if os.path.isfile(save_path + path): 48 | _file.append(path) 49 | if len(_file) > 0: 50 | _file.sort(key=lambda fn: os.path.getmtime(save_path + fn)) # 将文件按时间排序 51 | file_new = _file[-1] # 获取最新的文件保存到file_new 52 | # filetime = datetime.datetime.fromtimestamp(os.path.getmtime(file_new)) 53 | else: 54 | file_new = 'NULL' 55 | else: 56 | file_new = 'NULL' 57 | # logging.logger.info("文件的最新修改时间:" + filetime.strftime('%Y-%m-%d %H:%M:%S')) 58 | # logging.logger.info("最新修改的文件(夹):" + lists[-1]) 59 | return file_new 60 | 61 | 62 | if __name__ == '__main__': 63 | print(find_newest_file("/Users/li/PycharmProjects/event_parser/src/model/event_model/")) 64 | # print list_all_files("/Users/li/PycharmProjects/event_parser/src/log/") 65 | -------------------------------------------------------------------------------- /src/utils/keywords_extractor.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: keywords_extractor.py 8 | @time: 2018/11/19 10:19 AM 9 | 基于textRank的关键词提取 10 | """ 11 | import numpy as np 12 | 13 | from src.utils import data_process, dicts, tokenization 14 | from src.utils.log import log_util 15 | 16 | logging = log_util.Logger('keywordsExtractor_log') 17 | 18 | 19 | class TextRank(object): 20 | def __init__(self, top_k=20, with_weight=False, window=5, alpha=0.85, min_diff=1000): 21 | """ 22 | :param top_k: return how many top keywords. `None` for all possible words. 23 | :param with_weight: if True, return a list of (word, weight); 24 | if False, return a list of words. 
25 | :param window: 26 | :param alpha: 27 | :param min_diff: 28 | """ 29 | # self.sentence = sentence 30 | self.word_list = "" 31 | self.window = window 32 | self.alpha = alpha 33 | self.edge_dict = {} # 记录节点的边连接字典 34 | self.iter_num = min_diff # 设置收敛阈值 35 | self.topK = top_k # 提取关键词的个数 36 | self.withWeight = with_weight 37 | 38 | def _cut_sentence(self, sentence): 39 | """ 40 | # 对句子进行分词 41 | :return: 42 | """ 43 | # 使用多进程的时候需要修改一下 44 | dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(), tokenization.load_stop_words() 45 | tk = tokenization.Tokenizer(dp, stop_words) 46 | self.word_list = tk.token(sentence) 47 | # dicts.init() 48 | # jieba.load_userdict('user_dict.txt') 49 | # tag_filter = ['a', 'd', 'n', 'v'] 50 | # seg_result = pseg.cut(self.sentence) 51 | # self.word_list = [s.word for s in seg_result if s.flag in tag_filter] 52 | # print(self.word_list) 53 | 54 | def _create_nodes(self): 55 | """ 56 | # 根据窗口,构建每个节点的相邻节点,返回边的集合 57 | :return: 58 | """ 59 | tmp_list = [] 60 | word_list_len = len(self.word_list) 61 | for index, word in enumerate(self.word_list): 62 | if word not in self.edge_dict.keys(): 63 | tmp_list.append(word) 64 | tmp_set = set() 65 | left = index - self.window + 1 # 窗口左边界 66 | right = index + self.window # 窗口右边界 67 | if left < 0: left = 0 68 | if right >= word_list_len: right = word_list_len 69 | for i in range(left, right): 70 | if i == index: 71 | continue 72 | tmp_set.add(self.word_list[i]) 73 | self.edge_dict[word] = tmp_set 74 | 75 | def _create_matrix(self): 76 | """ 77 | # 根据边的相连关系,构建矩阵 78 | :return: 79 | """ 80 | self.matrix = np.zeros([len(set(self.word_list)), len(set(self.word_list))]) 81 | self.word_index = {} # 记录词的index 82 | self.index_dict = {} # 记录节点index对应的词 83 | 84 | for i, v in enumerate(set(self.word_list)): 85 | self.word_index[v] = i 86 | self.index_dict[i] = v 87 | for key in self.edge_dict.keys(): 88 | for w in self.edge_dict[key]: 89 | self.matrix[self.word_index[key]][self.word_index[w]] = 1 90 | self.matrix[self.word_index[w]][self.word_index[key]] = 1 91 | # 归一化 92 | for j in range(self.matrix.shape[1]): 93 | summary = 0 94 | for i in range(self.matrix.shape[0]): 95 | summary += self.matrix[i][j] 96 | for i in range(self.matrix.shape[0]): 97 | self.matrix[i][j] /= summary 98 | 99 | def _cal_pr(self): 100 | """ 101 | # 根据textRank公式计算权重 102 | :return: 103 | """ 104 | # 105 | self.PR = np.ones([len(set(self.word_list)), 1]) 106 | for i in range(self.iter_num): 107 | self.PR = (1 - self.alpha) + self.alpha * np.dot(self.matrix, self.PR) 108 | 109 | def _print_result(self): 110 | """ 111 | # 输出词和相应的权重 112 | :return: 113 | """ 114 | word_pr = {} 115 | for i in range(len(self.PR)): 116 | word_pr[self.index_dict[i]] = self.PR[i][0] 117 | if self.withWeight: 118 | tags = sorted(word_pr.items(), key=lambda x: x[1], reverse=True) 119 | # tags = sorted(word_pr.items(), key=itemgetter(1), reverse=True) 120 | else: 121 | tags = sorted(word_pr, key=word_pr.__getitem__, reverse=True) 122 | 123 | if self.topK: 124 | return tags[:self.topK] 125 | else: 126 | return tags 127 | 128 | def run(self, sentence): 129 | if type(sentence) is not list: 130 | self._cut_sentence(sentence) 131 | else: 132 | self.word_list = sentence 133 | 134 | if len(self.word_list) > 1: # bug 如果sentence分词后只有一个单词,则直接输出 135 | self._create_nodes() 136 | self._create_matrix() 137 | self._cal_pr() 138 | result = self._print_result() 139 | else: 140 | result = self.word_list 141 | return result 142 | 143 | 144 | def d_test(): 145 | """ 146 | 类接口测试 147 | :return: 148 | """ 
149 | # s = '程序员(英文Programmer)是从事程序开发、维护的专业人员。' \ 150 | # '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,' \ 151 | # '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。' 152 | 153 | # s = '【今日题材】[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,' \ 154 | # '还是注册制的, 关注同花顺财经(ths58), 获取更多机会。' 155 | 156 | s = '中兴通讯(000063)在经历七个一字跌停板后,于今天打开跌停板。债转股开盘大涨,天津普林(002134)、信达地产(600657)' \ 157 | '、海德股份(000567)集体涨停,长航凤凰(000520)、浙江东方(600120)、陕国投A(000563)大涨,消息面上,' \ 158 | '央行宣布定向降准0.5个百分点,将重点支持债转股。中兴通讯机构最低估值12.02元/股在复牌之前,' \ 159 | '多家基金公司对中兴通讯估值大多调整至20.54元/股。连续7个跌停板后,中兴通讯A股股价早就已经跌穿这一价格。' \ 160 | '据《中国经营报》记者不完全统计,6月20日~22日,多家基金公司再做出调整中兴通讯A股估值的公告,下调公司包括工银瑞信基金、' \ 161 | '华泰柏瑞基金、东方基金、大摩华鑫基金、融通基金、大成基金等22家基金公司。值得注意的是,此次基金公司估值下调幅度并不一致,' \ 162 | '调整估值在每股12.02~16.64元之间。其中,大摩华鑫基金、融通基金和安信基金给出的估值最高,为每股16.64元,而工银瑞信基金、' \ 163 | '富国基金和泰达宏利基金给出的估值最低,为每股12.02元。关注同花顺财经(ths518),获取更多机会' 164 | 165 | # s = u"水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \ 166 | # u"根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标,\n" \ 167 | # u"有部分省超过红线的指标。对一些超过红线的地方,\n陈明忠表示,对一些取用水项目进行区域的限批," \ 168 | # u"严格地进行水资源论证和取水许可的批准。" 169 | 170 | dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(), tokenization.load_stop_words() 171 | tk = tokenization.Tokenizer(dp, stop_words) 172 | s_list = tk.token(s) 173 | # 根据句子的长度,动态划分关键词的个数 174 | # top_k = int(len(s_list) * 0.1) 175 | # text_rank = TextRank(s_list, top_k=15, with_weight=True) 176 | 177 | text_rank = TextRank(top_k=15) 178 | res = text_rank.run(s_list) 179 | logging.logger.info("提取的%s个关键词: " % len(res)) 180 | if text_rank.withWeight: 181 | print(",".join(item[0] for item in res)) 182 | print(",".join(str(item[1]) for item in res)) 183 | else: 184 | print(",".join(str(item) for item in res)) 185 | 186 | 187 | def parallel_test(text): 188 | text_rank = TextRank(top_k=15) 189 | return text_rank.run(text) 190 | 191 | 192 | def multi_extract(s_lists): 193 | from multiprocessing import Pool 194 | import multiprocessing as mp 195 | res_l = [] 196 | pool = Pool(processes=int(mp.cpu_count())) 197 | for s_list in s_lists: 198 | res = pool.apply_async(parallel_test, (s_list,)) 199 | res_l.append(res.get()) 200 | pool.close() 201 | pool.join() 202 | 203 | return res_l 204 | 205 | 206 | def multi_extract_test(): 207 | """ 208 | 多进程测试 209 | :return: 210 | """ 211 | import time 212 | from multiprocessing import Pool 213 | import multiprocessing as mp 214 | 215 | s = '程序员(英文Programmer)是从事程序开发、维护的专业人员。' \ 216 | '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,' \ 217 | '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。' 218 | 219 | dp = data_process.DataPressing() 220 | dict_init = dicts.init() 221 | stop_words = tokenization.load_stop_words() 222 | # 分词 223 | tk = tokenization.Tokenizer(dp, stop_words) 224 | s_list = tk.token(s) 225 | t0 = time.time() 226 | for i in range(10000): 227 | parallel_test(s_list) 228 | logging.logger.info("串行处理花费时间{t}".format(t=time.time()-t0)) 229 | 230 | pool = Pool(processes=int(mp.cpu_count())) 231 | res_l = [] 232 | t1 = time.time() 233 | for i in range(10000): 234 | res = pool.apply_async(parallel_test, (s_list,)) 235 | res_l.append(res) 236 | # pool.map(parallel_test, s_list) 237 | 238 | # for i in res_l: 239 | # print i.get() 240 | pool.close() 241 | pool.join() 242 | logging.logger.info("并行处理花费时间{t}s".format(t=time.time()-t1)) 243 | 244 | 245 | if __name__ == '__main__': 246 | d_test() 247 | # multi_extract_test() 248 | -------------------------------------------------------------------------------- /src/utils/log/__init__.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: 1.0
6 | @author: li
7 | @file: __init__.py
8 | @time: 2019/12/12 2:14 下午
9 | """
-------------------------------------------------------------------------------- /src/utils/log/log2.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li
7 | @file: log2.py
8 | @time: 2018-12-21 17:15
9 | """
10 | import logging.handlers
11 | from src.utils import file_util
12 | 
13 | 
14 | class LoggerConfig(object):
15 |     def __init__(self, log_file_name):
16 |         self.log_file_name = log_file_name
17 | 
18 |     def logger_info(self):
19 |         log_file = '../log/%s_info.log' % self.log_file_name
20 |         file_util.check_path(log_file)
21 |         handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=10240 * 1024, backupCount=5)  # 实例化handler
22 |         fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
23 |         formatter = logging.Formatter(fmt)  # 实例化formatter
24 |         handler.setFormatter(formatter)  # 为handler添加formatter
25 |         logger = logging.getLogger('info')  # 获取名为info的logger
26 |         if not logger.handlers:
27 |             logger.addHandler(handler)  # 为logger添加handler
28 |         logger.setLevel(logging.INFO)
29 |         return logger
30 | 
31 |     def logger_error(self):
32 |         log_file = '../log/%s_error.log' % self.log_file_name
33 |         file_util.check_path(log_file)
34 |         handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=10240 * 1024, backupCount=5)  # 实例化handler
35 |         fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
36 |         formatter = logging.Formatter(fmt)  # 实例化formatter
37 |         handler.setFormatter(formatter)  # 为handler添加formatter
38 |         logger = logging.getLogger('error')  # 获取名为error的logger
39 |         if not logger.handlers:
40 |             logger.addHandler(handler)  # 为logger添加handler
41 |         logger.setLevel(logging.ERROR)
42 |         return logger
-------------------------------------------------------------------------------- /src/utils/log/log_util.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li
7 | @file: log_util.py
8 | @time: 2018-12-26 16:33
9 | """
10 | 
11 | import logging
12 | from src.utils import file_util
13 | from logging import handlers
14 | 
15 | 
16 | class Logger(object):
17 |     # 日志级别映射
18 |     level_relations = {
19 |         'debug': logging.DEBUG,
20 |         'info': logging.INFO,
21 |         'warning': logging.WARNING,
22 |         'error': logging.ERROR,
23 |         'crit': logging.CRITICAL
24 |     }
25 | 
26 |     def __init__(self, log_file_name, level='debug', when='D', backup_count=5,
27 |                  fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
28 |         log_file = '../log/%s.log' % log_file_name
29 |         file_util.check_path(log_file)
30 |         self.logger = logging.getLogger(log_file)
31 |         format_str = logging.Formatter(fmt)  # 设置日志格式
32 |         self.logger.setLevel(self.level_relations.get(level))  # 设置日志级别
33 |         sh = logging.StreamHandler()  # 往屏幕上输出
34 |         sh.setFormatter(format_str)  # 设置屏幕上显示的格式
35 |         th = handlers.TimedRotatingFileHandler(filename=log_file, when=when, backupCount=backup_count,
36 |                                                encoding='utf-8')  # 往文件里写入,按指定间隔时间自动生成新文件的处理器
37 |         # 实例化TimedRotatingFileHandler
38 |         # interval是时间间隔,backupCount是备份文件的个数,如果超过这个个数,就会自动删除,when是间隔的时间单位,单位有以下几种:
39 |         # S 秒
40 |         # M 分
41 |         # H 小时
42 |         # D 天
43 |         # W 每星期(interval==0时代表星期一)
44 |         # midnight 每天凌晨
45 |         th.setFormatter(format_str)  # 设置文件里写入的格式
46 |         self.logger.addHandler(sh)  # 把对象加到logger里
47 |         self.logger.addHandler(th)
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     log = Logger('all', level='info')
52 |     log.logger.debug('debug')
53 |     log.logger.info('info')
54 |     log.logger.warning(u'警告')
55 |     log.logger.error(u'报错')
56 |     log.logger.critical(u'严重')
57 |     Logger('error.log', level='error').logger.error('error')
-------------------------------------------------------------------------------- /src/utils/news.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li
7 | @file: news.py
8 | @time: 2018/11/28 10:56 AM
9 | """
10 | 
11 | 
12 | class news(object):
13 |     def __init__(self, title, content, publish_time):
14 |         self.title = title
15 |         self.content = content
16 |         self.publish_time = publish_time
17 | 
18 |     def get_title(self):
19 |         return self.title
20 | 
21 |     def get_content(self):
22 |         return self.content
23 | 
24 |     def get_publish_time(self):
25 |         return self.publish_time
26 | 
27 |     def news_detail(self):
28 |         if self.title and self.content:
29 |             print(self.title + self.content)
30 | 
31 |     def news_lists(self):
32 |         pass
33 | 
-------------------------------------------------------------------------------- /src/utils/test.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | @version: ??
6 | @author: li 7 | @file: test.py 8 | @time: 2018/11/12 2:19 PM 9 | """ 10 | import sys 11 | sys.path.append("..") 12 | from gensim import corpora, models, similarities 13 | from src.utils.tokenization import Tokenizer, load_stop_words 14 | from src.configure import Configure 15 | from src.utils import data_process 16 | 17 | 18 | conf = Configure() 19 | 20 | 21 | raw_documents = [ 22 | u'0无偿居间介绍买卖毒品的行为应如何定性', 23 | u'1吸毒男动态持有大量毒品的行为该如何认定', 24 | u'2如何区分是非法种植毒品原植物罪还是非法制造毒品罪', 25 | u'3为毒贩贩卖毒品提供帮助构成贩卖毒品罪', 26 | u'4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定', 27 | u'5为获报酬帮人购买毒品的行为该如何认定', 28 | u'6毒贩出狱后再次够买毒品途中被抓的行为认定', 29 | u'7虚夸毒品功效劝人吸食毒品的行为该如何认定', 30 | u'8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻', 31 | u'9一方未签字办理的结婚登记是否有效', 32 | u'10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚', 33 | u'11结婚前对方父母出资购买的住房写我们二人的名字有效吗', 34 | u'12身份证被别人冒用无法登记结婚怎么办?', 35 | u'13同居后又与他人登记结婚是否构成重婚罪', 36 | u'14未办登记只举办结婚仪式可起诉离婚吗', 37 | u'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚' 38 | ] 39 | 40 | 41 | # def tokenization(filename): 42 | # """ 43 | # 对语料进行分词,分词之后先按照词性过滤出一些停用词,然后在通过停用词表过滤掉一些停用词。 44 | # :param filename: 45 | # :return: 46 | # """ 47 | # dicts.init() # 初始化人工词典 48 | # result = [] 49 | # with open(filename, 'r') as apply_func: 50 | # text = apply_func.read() 51 | # words = pseg.cut(text) 52 | # for word, flag in words: 53 | # if flag not in stop_flag and word not in stopwords: 54 | # result.append(word) 55 | # return result 56 | 57 | 58 | # 语料库准备,导入所有的语料,并且进行分词,去停用词 59 | # filenames = ['/Users/yiiyuanliu/Desktop/nlp/demo/articles/13 件小事帮您稳血压.txt', 60 | # '/Users/yiiyuanliu/Desktop/nlp/demo/articles/高血压患者宜喝低脂奶.txt', 61 | # '/Users/yiiyuanliu/Desktop/nlp/demo/articles/ios.txt'] 62 | 63 | 64 | # corpus = [] 65 | # t = Tokenizer() 66 | # 67 | # for each in raw_documents: 68 | # corpus.append(t.token(each)) 69 | # print len(corpus) 70 | # 71 | # 72 | # for item in corpus[0]: 73 | # print item 74 | # 75 | # 76 | # def DictionaryBuild(corpus): 77 | # # 建立词袋模型。 78 | # dictionary = corpora.Dictionary(corpus) 79 | # return dictionary 80 | # 81 | # 82 | # dictionary = DictionaryBuild(corpus) 83 | # print dictionary 84 | # 85 | # 86 | # def docVectors(dictionary): 87 | # doc_vectors = [dictionary.doc2bow(text) for text in corpus] 88 | # print len(doc_vectors) 89 | # print doc_vectors 90 | # 91 | # docVectors(dictionary) 92 | 93 | 94 | # query = tokenization('/Users/yiiyuanliu/Desktop/nlp/demo/articles/关于降压药的五个问题.txt') 95 | # query_bow = dictionary.doc2bow(query) 96 | # print query_bow 97 | # 98 | # 99 | # # 文本相似度计算 100 | # # 基于积累的事件,首先计算所有事件的词向量或者tf-idf值,然后将新晋事件与最近的事件进行相似度计算,计算 101 | # lsi = models.LsiModel(tfidf_vectors, id2word=dictionary, num_topics=2) 102 | 103 | 104 | if __name__ == '__main__': 105 | import dicts 106 | data_processing = data_process.DataPressing() 107 | dict_init = dicts.init() 108 | stop_words = load_stop_words() 109 | t = Tokenizer(data_processing, stop_words) 110 | stock_dict = dicts.stock_dict 111 | print(["大智慧".decode("utf8")]) 112 | a = ["大智慧".decode("utf8")] 113 | print(len(a[0])) 114 | # print(["【今日题材】".decode("utf8")]) 115 | 116 | # file = open('file_name.txt', 'w') 117 | # file.write(str(raw_documents)) 118 | # file.close() 119 | 120 | # 剔除杂质词 121 | print(data_processing.no_remove("【今日题材】[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的")) 122 | 123 | # 判断content中是否存在某些特殊词 124 | print(data_processing.useless_contain("[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的")) 125 | 126 | # 筛选新闻,筛选出股市收报 127 | # str = 
'午后,分散染料概念股走强。截至发稿,浙江龙盛(600352)[AI决策](浙江龙盛(600352)[AI决策]-CN)涨6.74%报13.15元,闰土股份(002440)[AI决策](闰土股份(002440)[AI决策]-CN)涨5.84%报19.94元,安诺其(300067)[AI决策](安诺其(300067)[AI决策]-CN)涨5.46%报6.38元,吉华集团(603980)[AI决策](吉华集团(603980)[AI决策]-CN)涨3.41%报22.42元,航民股份(600987)[AI决策](航民股份(600987)[AI决策]-CN)、江苏吴中(600200)[AI决策](江苏吴中(600200)[AI决策]-CN)等个股跟随上涨近2%。据分散染料龙头企业介绍,由于环保形势的持续严峻,企业开工受到限制,染料供应量较少,库存偏低。染料贸易商和印染企业前期采购的分散染料,经过四季度的消耗库存已经很低,近期需要补仓,刚需力度增强。推荐阅读:浙江龙盛最新消息' 128 | # a = t.token(str) 129 | # tmp_res = data_processing.useless_filter(a, stock_dict) 130 | # print tmp_res 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /src/utils/time_util.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 6 | @author: li 7 | @file: time_util.py 8 | @time: 2018/11/29 10:15 AM 9 | """ 10 | 11 | import time 12 | import datetime 13 | 14 | 15 | def time_to_timestamp(time_str, style=None): 16 | """ 17 | 固定格式的时间转换成时间戳 18 | :param time_str: 19 | :param style: 20 | :return: 21 | """ 22 | if style is None: 23 | style = "%Y-%m-%d %H:%M:%S" 24 | time_array = time.strptime(time_str, style) 25 | time_stamp = int(time.mktime(time_array)) 26 | return time_stamp 27 | 28 | 29 | def timestamp_to_time(time_stamp, style=None): 30 | """ 31 | 时间戳转换成固定格式的时间 32 | :param time_stamp: 33 | :param style: 34 | :return: 35 | """ 36 | if style is None: 37 | style = "%Y-%m-%d %H:%M:%S" 38 | time_array = time.localtime(time_stamp) 39 | date_time = time.strftime(style, time_array) 40 | return date_time 41 | 42 | 43 | def get_integral_point_time(hour=0, minute=0, sec=0): 44 | """ 45 | 获取当天的某个时间点, 并转化成时间戳 46 | :param sec: 秒 47 | :param minute: 分钟 48 | :param hour: 小时 49 | :return: 50 | """ 51 | if hour > 24 or minute > 60 or sec > 60: 52 | print('time error in get_integral_point_time') 53 | exit() 54 | today = datetime.date.today().strftime("%Y-%m-%d") + ' %s:%s:%s' % (hour, minute, sec) 55 | time_array = time.strptime(today, "%Y-%m-%d %H:%M:%S") 56 | today_time = int(time.mktime(time_array)) 57 | return today_time 58 | 59 | 60 | if __name__ == '__main__': 61 | now = int(time.time()) 62 | # print now 63 | print(time_to_timestamp("2018-12-5 21:49:7", "%Y-%m-%d %H:%M:%S")) 64 | print(timestamp_to_time(now, "%Y-%m-%d %H:%M:%S")) 65 | print(get_integral_point_time(9)) 66 | -------------------------------------------------------------------------------- /src/utils/tokenization.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: ?? 
6 | @author: li 7 | @file: tokenization.py 8 | @time: 2018/11/2 10:40 AM 9 | 分词模块 10 | 调用jieba分词,添加用户自定义词典,封装,并且去停用词等操作 11 | """ 12 | import codecs 13 | import sys 14 | 15 | import jieba.posseg as pseg 16 | 17 | from src.utils import data_process, dicts 18 | from src.utils.log import log_util 19 | 20 | sys.path.append('..') 21 | sys.path.append('../') 22 | sys.path.append('../../') 23 | from src.configure import Configure 24 | 25 | logging = log_util.Logger('tokenization_log') 26 | 27 | stopwords = globals() 28 | 29 | 30 | def load_stop_words(): 31 | # 停用词库准备, 构建停用词表 32 | conf = Configure() 33 | stop_words_path = conf.stop_words_path 34 | words_count = dict() 35 | try: 36 | stop_word = codecs.open(stop_words_path, 'r', encoding='utf8').readlines() 37 | stop_words = [w.strip() for w in stop_word] 38 | logging.logger.info("Stopwords 导入成功!") 39 | return stop_words 40 | except BaseException as e: 41 | logging.logger.error('Stop Words Exception: {0}'.format(e)) 42 | 43 | 44 | class Tokenizer(object): 45 | def __init__(self, data_process, stop_words): 46 | # dicts.init() # 初始化人工词典 47 | self.data_precessing = data_process 48 | # self.dicts = dict_init 49 | # 按照词性去停用词 50 | # 去停用词的词性列表,包括[标点符号、连词、助词、副词、介词、时语素、‘的’, 数词, 方位词, 代词, 形容词, 动词],暂时没有使用,原因是添加的新词没有添加词性,所以新词词性有问题。 51 | self.stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'apply_func', 'r'] 52 | self.stopwords = stop_words 53 | self.words_count = {} 54 | 55 | def remove_stopwords(self): 56 | # 使用映射来判断当前元素是否在字典中,速度会比list匹配快 57 | # [pandas apply 函数 多进程实现](https://blog.csdn.net/Jerry/article/details/71425298?utm_source=blogxgwz1#commentBox) 58 | pass 59 | 60 | def token(self, text): 61 | """ 62 | 对语料进行分词,分词之后先按照词性过滤出一些停用词,然后在通过停用词表过滤掉一些停用词。 63 | :param text: 64 | :return: 65 | """ 66 | if text is None: 67 | return None 68 | result = [] 69 | words = pseg.cut(self.data_precessing.no_remove(text)) 70 | 71 | # for word, flag in words: 72 | # result.append(word) 73 | 74 | for word, flag in words: 75 | if flag not in self.stop_flag and word not in self.stopwords and len(word) >= 2: 76 | result.append(word) 77 | return result 78 | 79 | 80 | def d_test(): 81 | data_processing = data_process.DataPressing() 82 | dict_init = dicts.init() 83 | stop_words = load_stop_words() 84 | tk = Tokenizer(data_processing, stop_words) 85 | # print(["大智慧".decode("utf8")]) 86 | # print(["【今日题材】".decode("utf8")]) 87 | # print(["关注同".decode("utf-8")]) 88 | 89 | # 剔除杂质词 90 | print(data_processing.no_remove("【今日题材】[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的, 关注同花顺财经(ths58), 获取更多机会。")) 91 | # 判断content中是否存在某些特殊词 92 | print(data_processing.useless_contain("[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的")) 93 | 94 | # 对content中的内容进行去停,去杂质词,分词 95 | # result = tk.token("【今日题材】[AI决策]加多宝的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的") 96 | result = tk.token("加多宝重推红罐 是否能再与王老吉争锋") 97 | print('Type of result: {}。'.format(type(result))) 98 | for i in result: 99 | print(i) 100 | 101 | 102 | def paralize_test(text, data_process, stop_words): 103 | t = Tokenizer(data_process, stop_words) 104 | restult = t.token(text) 105 | return restult 106 | 107 | 108 | def multi_token_test(): 109 | """ 110 | 多进程测试 111 | :return: 112 | """ 113 | import time 114 | from multiprocessing import Pool 115 | import multiprocessing as mp 116 | 117 | s = '程序员(英文Programmer)是从事程序开发、维护的专业人员。' \ 118 | '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,' \ 119 | '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。' 120 | 121 | dataprocess = data_process.DataPressing() 122 | dicts.init() 123 | stop_words = load_stop_words() 124 | # 
串行处理
125 |     t0 = time.time()
126 |     res1_l = []
127 |     for i in range(10000):
128 |         res1 = paralize_test(s, dataprocess, stop_words)
129 |         res1_l.append(res1)
130 |     print("串行处理花费时间{t}s".format(t=time.time() - t0))
131 | 
132 |     # 并行处理
133 |     t1 = time.time()
134 |     res2_l = []
135 |     pool = Pool(processes=int(mp.cpu_count() * 0.8))
136 |     for i in range(10000):
137 |         res = pool.apply_async(paralize_test, (s, dataprocess, stop_words))
138 |         res2_l.append(res)
139 |     # 获取数据
140 |     # for k in res2_l:
141 |     #     print k.get()
142 |     pool.close()
143 |     pool.join()
144 |     print("并行处理花费时间{t}s".format(t=time.time() - t1))
145 | 
146 | 
147 | # tokenizer = Tokenizer()
148 | if __name__ == '__main__':
149 |     d_test()
150 |     # multi_token_test()
151 | 
--------------------------------------------------------------------------------
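
A minimal usage sketch of how the utilities under src/utils chain together for one piece of news text (the headline below is a made-up example; the module, class and function names — dicts.init, data_process.DataPressing, tokenization.load_stop_words / Tokenizer, keywords_extractor.TextRank — all come from the files above, and the snippet assumes it is run from the project root so that src.* is importable):

# 用法示意(非项目原有代码):清洗 -> 分词 -> TextRank 关键词提取
from src.utils import data_process, dicts, tokenization
from src.utils.keywords_extractor import TextRank

if __name__ == '__main__':
    dicts.init()                                 # 加载用户自定义词典到 jieba
    dp = data_process.DataPressing()             # 杂质词清洗
    stop_words = tokenization.load_stop_words()  # 停用词表
    tk = tokenization.Tokenizer(dp, stop_words)

    headline = u'央行宣布定向降准0.5个百分点,债转股概念股集体走强。'  # 假设的新闻标题
    words = tk.token(headline)                   # 清洗、分词、去停用词后的词列表
    keywords = TextRank(top_k=5).run(words)      # 提取前 5 个关键词
    print(keywords)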