├── Faiss的使用
│   ├── 001-欧式距离检索.py
│   ├── 002-倒排表快速索引.py
│   ├── 003-乘积量化索引.py
│   ├── 004-faiss实现kmeans聚类.py
│   ├── 005-faiss实现pca降维.py
│   ├── 006-faiss实现PQ编码和解码.py
│   ├── 007-faiss实现标量量化器.py
│   └── 008-faiss_use_gpu.py
├── LAC分词器
│   ├── 001-分词.py
│   ├── 002-词性标注和实体识别.py
│   ├── 003-加载自己的词表进行分词.py
│   └── vocab.txt
├── PySpark
│   ├── .DS_Store
│   ├── 001-data_processing_use_pyspark.py
│   ├── 002-linear_regression_use_pyspark.py
│   ├── 003-logistic_regression_use_pyspark.py
│   ├── 004-random_forests_classification_use_pyspark.py
│   ├── 005-kmeans_cluster_use_pyspark.py
│   ├── 006-recommendr_system_use_pyspark.py
│   ├── 007-NLP_use_pyspark.py
│   └── data
│       ├── Linear_regression_dataset.csv
│       ├── Log_Reg_dataset.csv
│       ├── Movie_reviews.csv
│       ├── affairs.csv
│       ├── iris_dataset.csv
│       ├── movie_ratings_df.csv
│       └── sample_data.csv
├── README.md
├── RSA实战
│   ├── 001-rsa生成公私钥并保存.py
│   └── 002-公钥加密私钥解密.py
├── apscheduler实现定时任务
│   └── 定时任务.py
├── chinesebert中的pinyin和glyph的处理
│   ├── MSYH.TTC
│   ├── image_test.py
│   └── pinyin_test.py
├── collections的用法
│   └── 001-collections中的namedtuple用法.py
├── elasticsearch
│   ├── 001-创建库并插入数据.py
│   └── 002-es中的搜索.py
├── flask+echart+ajax
│   ├── .DS_Store
│   ├── app.py
│   ├── static
│   │   ├── .DS_Store
│   │   ├── css
│   │   │   └── main.css
│   │   └── js
│   │       ├── controller.js
│   │       ├── echarts.min.js
│   │       ├── jquery.js
│   │       ├── left.js
│   │       └── right.js
│   └── templates
│       └── index.html
├── flask表单那些事
│   ├── .DS_Store
│   ├── app.py
│   └── templates
│       └── index.html
├── gensim
│   ├── 001-TF-IDF句子相似度计算.py
│   ├── 002-gensim文本摘要.py
│   └── data
│       ├── answer.txt
│       ├── question.txt
│       ├── stopwords.txt
│       ├── test.py
│       └── text.txt
├── gradio学习
│   ├── 01-row_column_layout.py
│   └── 02-chatglm_web.py
├── ipdb调试python程序
│   ├── 001-简单调试.py
│   └── readme.txt
├── logging模块的使用
│   ├── 001-日志级别的使用.py
│   ├── 002-日志控制台输出.py
│   ├── 003-日志文件输出.py
│   └── 004-捕捉异常.py
├── pandas一键画图
│   ├── 001-plot_zhexiantu.html
│   ├── 001-plot_zhexiantu.py
│   ├── 002-plot_sandiantu.html
│   ├── 002-plot_sandiantu.py
│   ├── 003-plot_zhuzhuangtu.html
│   └── 003-plot_zhuzhuangtu.py
├── py2neo操作neo4j
│   ├── .DS_Store
│   ├── py2neo简单练习
│   │   ├── create_graph_v1.py
│   │   ├── create_graph_v2.py
│   │   ├── mingchaonaxieshier.xlsx
│   │   ├── santi.xlsx
│   │   └── test.xlsx
│   ├── readme.txt
│   ├── 事件三元组抽取
│   │   ├── ltp的使用.py
│   │   ├── my_vocab.txt
│   │   └── readme.txt
│   └── 医疗知识图谱问答
│       ├── .DS_Store
│       ├── ahocorasick的使用
│       │   └── demo.py
│       ├── build_medical_graph.py
│       ├── data
│       │   ├── medical.json
│       │   └── medical_min.json
│       ├── data_process
│       │   ├── __init__.py
│       │   ├── __pycache__
│       │   │   ├── __init__.cpython-37.pyc
│       │   │   ├── answer_search.cpython-37.pyc
│       │   │   ├── question_classifier.cpython-37.pyc
│       │   │   └── question_parser.cpython-37.pyc
│       │   ├── answer_search.py
│       │   ├── question_classifier.py
│       │   └── question_parser.py
│       ├── dict
│       │   ├── check.txt
│       │   ├── deny.txt
│       │   ├── department.txt
│       │   ├── disease.txt
│       │   ├── drug.txt
│       │   ├── food.txt
│       │   ├── producer.txt
│       │   └── symptom.txt
│       └── run_chatbot.py
├── pyecharts使用
│   ├── 001-柱状图.py
│   ├── 002-折线图.py
│   ├── 003-饼状图.py
│   ├── 折线图.html
│   ├── 柱状图.html
│   └── 饼状图.html
├── pymysql的使用
│   ├── 001-创建数据库.py
│   ├── 002-创建表插入数据.py
│   ├── 003-查询.py
│   ├── 004-更新.py
│   └── 005-删除.py
├── python并发编程
│   ├── 001-多线程.py
│   ├── 002-生产者消费者实现多线程爬虫.py
│   ├── 003-多线程锁机制.py
│   ├── 004-线程池的使用.py
│   ├── 005-线程池加速flask-web服务.py
│   ├── 006-多进程的使用.py
│   ├── 007-多进程加速flask-web服务.py
│   ├── 008-协程爬虫.py
│   ├── 009-使用信号量控制协程数进行爬虫.py
│   └── data.txt
├── streamlit的使用
│   └── 鸢尾花数据的分类app
│       └── app.py
└── textrank4zh
    ├── 001-关键词提取.py
    ├── 002-摘要抽取.py
    ├── data
    │   └── text.txt
    └── readme.txt
/Faiss的使用/001-欧式距离检索.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-欧式距离检索.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 |
10 |
11 | if __name__ == '__main__':
12 |     n_data, d = 1000, 512  # number of vectors in the corpus, dimension of each vector
13 |     np.random.seed(43)  # fix the random seed so repeated runs give the same results
14 |
15 |     # Build the search corpus
16 |     data = []
17 |     mu, sigma = 3, 0.1  # vectors are drawn from a Gaussian with this mean and standard deviation
18 |     for i in range(n_data):
19 |         data.append(np.random.normal(mu, sigma, d))
20 |     data = np.array(data).astype('float32')  # faiss only supports 32-bit floats
21 |
22 |     # Generate the query vectors
23 |     query = []
24 |     n_query = 10  # generate 10 query vectors
25 |     mu, sigma = 3, 0.1
26 |     np.random.seed(12)
27 |     for i in range(n_query):
28 |         query.append(np.random.normal(mu, sigma, d))
29 |     query = np.array(query).astype('float32')
30 |
31 |     # Build the index; remember to pass the vector dimension d
32 |     index = faiss.IndexFlatL2(d)
33 |     # print(index.is_trained)  # if this is False, the index must be trained first (see the later examples)
34 |
35 |     # Add the data
36 |     index.add(data)
37 |     # print(index.ntotal)  # total number of indexed vectors
38 |
39 |     # Run the search
40 |     k = 10  # return the 10 nearest neighbors
41 |
42 |     # Query with the first five corpus vectors so the result is easy to check: a vector's distance to itself is 0, so its own index should come back first
43 |     query_self = data[:5]
44 |
45 |     dis, ind = index.search(query_self, k=k)
46 |     print(dis)  # each row: distances from one query to its 10 nearest vectors
47 |     print(ind)  # each row: indices of the 10 nearest vectors for one query
48 | """
49 | [[0. 8.55197 8.634906 8.683499 8.698736 8.821949 8.902446
50 | 8.943979 8.9516735 8.972908 ]
51 | [0. 8.369204 8.482748 8.53028 8.581224 8.680499 8.684254
52 | 8.697291 8.719812 8.753435 ]
53 | [0. 8.209936 8.392483 8.456179 8.473589 8.480727 8.551348
54 | 8.553277 8.576391 8.592704 ]
55 | [0. 8.473689 8.621014 8.827385 8.883725 8.980131 8.99064
56 | 9.015673 9.017438 9.027972 ]
57 | [0. 8.268832 8.349455 8.597895 8.611757 8.658188 8.675722
58 | 8.685029 8.70588 8.707612 ]]
59 | [[ 0 877 502 42 606 366 348 923 563 56]
60 | [ 1 849 974 106 348 364 877 242 280 173]
61 | [ 2 877 127 655 253 233 558 678 13 208]
62 | [ 3 421 94 348 502 402 536 646 563 735]
63 | [ 4 986 230 209 446 889 974 241 550 248]]
64 | """
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/Faiss的使用/002-倒排表快速索引.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-倒排表快速索引.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import numpy as np
8 | import faiss
9 |
10 | if __name__ == '__main__':
11 |     n_data, d = 1000, 512  # number of vectors in the corpus, dimension of each vector
12 |     np.random.seed(43)  # fix the random seed so repeated runs give the same results
13 |
14 |     # Build the search corpus
15 |     data = []
16 |     mu, sigma = 3, 0.1  # vectors are drawn from a Gaussian with this mean and standard deviation
17 |     for i in range(n_data):
18 |         data.append(np.random.normal(mu, sigma, d))
19 |     data = np.array(data).astype('float32')  # faiss only supports 32-bit floats
20 |
21 |     # Generate the query vectors
22 |     query = []
23 |     n_query = 10  # generate 10 query vectors
24 |     mu, sigma = 3, 0.1
25 |     np.random.seed(12)
26 |     for i in range(n_query):
27 |         query.append(np.random.normal(mu, sigma, d))
28 |     query = np.array(query).astype('float32')
29 |
30 |     nlist = 50  # number of Voronoi cells to partition the database vectors into
31 |     k = 10
32 |     quantizer = faiss.IndexFlatL2(d)  # the coarse quantizer
33 |     index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)  # METRIC_L2 for L2 distance, or faiss.METRIC_INNER_PRODUCT for inner product
34 |     assert not index.is_trained  # inverted-file (IVF) indexes must be trained before use
35 |     index.train(data)  # the training set should follow the same distribution as the database
36 |     assert index.is_trained
37 |
38 |     index.add(data)
39 |     index.nprobe = 2  # search only the 2 nearest Voronoi cells
40 | dis, ind = index.search(query, k)
41 | print(dis)
42 | print(ind)
43 |
--------------------------------------------------------------------------------
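
A note on the speed/recall trade-off that nprobe controls: with nprobe=2 only 2 of the 50 Voronoi cells are visited, so some true neighbors can be missed. Below is a minimal sketch (appended to the script above, reusing its data, query, index, d and k) that measures recall@k against the exhaustive IndexFlatL2 ground truth for several nprobe values; the exact numbers will vary with the random data:

    import time

    flat = faiss.IndexFlatL2(d)  # exhaustive baseline provides the ground truth
    flat.add(data)
    _, gt = flat.search(query, k)

    for nprobe in (1, 5, 20, 50):
        index.nprobe = nprobe
        t0 = time.time()
        _, ind = index.search(query, k)
        # recall@k: fraction of true top-k neighbors recovered, averaged over queries
        recall = np.mean([len(np.intersect1d(ind[i], gt[i])) / k for i in range(len(gt))])
        print('nprobe=%d  recall@%d=%.3f  time=%.4fs' % (nprobe, k, recall, time.time() - t0))

With nprobe equal to nlist the IVF index degenerates to an exhaustive search, so recall should reach 1.0.
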
/Faiss的使用/003-乘积量化索引.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-乘积量化索引.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import numpy as np
8 | import faiss
9 |
10 | if __name__ == '__main__':
11 |     n_data, d = 1000, 512  # number of vectors in the corpus, dimension of each vector
12 |     np.random.seed(43)  # fix the random seed so repeated runs give the same results
13 |
14 |     # Build the search corpus
15 |     data = []
16 |     mu, sigma = 3, 0.1  # vectors are drawn from a Gaussian with this mean and standard deviation
17 |     for i in range(n_data):
18 |         data.append(np.random.normal(mu, sigma, d))
19 |     data = np.array(data).astype('float32')  # faiss only supports 32-bit floats
20 |
21 |     # Generate the query vectors
22 |     query = []
23 |     n_query = 10  # generate 10 query vectors
24 |     mu, sigma = 3, 0.1
25 |     np.random.seed(12)
26 |     for i in range(n_query):
27 |         query.append(np.random.normal(mu, sigma, d))
28 |     query = np.array(query).astype('float32')
29 |
30 |     nlist = 50
31 |     m = 8  # number of sub-vectors each vector is split into; must divide d evenly
32 |     k = 10
33 |     quantizer = faiss.IndexFlatL2(d)
34 |     index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 4)  # 4 means each sub-vector is encoded with 4 bits
35 |
36 |     index.train(data)
37 |     index.add(data)
38 |     index.nprobe = 50
39 |     dis, ind = index.search(data[:10], k)  # sanity check: query with the corpus itself
40 |     print(dis)
41 |     print(ind)
42 |
43 |     dis, ind = index.search(query, k)  # the real queries
44 | print(dis)
45 | print(ind)
46 |
--------------------------------------------------------------------------------
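
For intuition on why product quantization saves memory: each 512-dim float32 vector occupies 2048 bytes, while the PQ code configured above (m=8 sub-vectors at 4 bits each) occupies only 4 bytes. A quick back-of-the-envelope check using the parameters from the script:

    d, m, nbits = 512, 8, 4
    raw_bytes = d * 4               # one float32 vector: 512 * 4 = 2048 bytes
    code_bytes = m * nbits / 8      # one PQ code: 8 * 4 bits = 4 bytes
    print(raw_bytes / code_bytes)   # 512x compression, ignoring per-list index overhead

The price is lossy compression: distances are computed on the reconstructed codes, which is why the self-query above no longer returns exact zeros.
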
/Faiss的使用/004-faiss实现kmeans聚类.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 004-faiss实现kmeans聚类.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 |
10 |
11 | if __name__ == '__main__':
12 |     # Data
13 |     n_data, d = 2000, 512
14 |     np.random.seed(43)
15 |     data = []
16 |     mu, sigma = 3, 0.1
17 |     for i in range(n_data):
18 |         data.append(np.random.normal(mu, sigma, d))
19 |     data = np.array(data).astype('float32')
20 |
21 |     # Clustering
22 |     n_centroids = 1024  # number of cluster centroids
23 |     d = data.shape[1]
24 |     kmeans = faiss.Kmeans(d, n_centroids)
25 |     kmeans.train(data)
26 |     # Print the cluster centroids
27 |     # print(kmeans.centroids)
28 |     # print(len(kmeans.centroids))
29 |
30 |     # Check which cluster each of the first five vectors belongs to (the two most likely clusters)
31 |     D, I = kmeans.index.search(data[:5], k=2)
32 |     print(D)  # distance to each of the two nearest centroids
33 |     print(I)  # the corresponding cluster ids
34 | """
35 |     Output:
36 | [[4.1553707 5.2924204]
37 | [1.9329664 4.930997 ]
38 | [4.537619 4.8509283]
39 | [4.6700296 5.2252126]
40 | [2.101182 4.9292693]]
41 | [[478 568]
42 | [767 697]
43 | [568 527]
44 | [999 568]
45 | [175 853]]
46 | """
47 |
48 | print('*'*100)
49 |     # Find the vectors closest to each centroid
50 | k = 5
51 | index = faiss.IndexFlatL2(d)
52 | index.add(data)
53 | D, I = index.search(kmeans.centroids, k)
54 | print(D)
55 | print(I)
56 |
57 |
58 |
59 |
--------------------------------------------------------------------------------
/Faiss的使用/005-faiss实现pca降维.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 005-faiss实现pca降维.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 |
10 |
11 | if __name__ == '__main__':
12 |     # Data
13 | n_data, d = 2000, 512
14 | np.random.seed(43)
15 | data = []
16 | mu, sigma = 3, 0.1
17 | for i in range(n_data):
18 | data.append(np.random.normal(mu, sigma, d))
19 | data = np.array(data).astype('float32')
20 |
21 |     mat = faiss.PCAMatrix(512, 64)  # reduce from 512 to 64 dimensions
22 | mat.train(data)
23 | assert mat.is_trained
24 | tr = mat.apply_py(data)
25 | print(tr.shape)
26 | print(tr)
27 |
28 |
--------------------------------------------------------------------------------
/Faiss的使用/006-faiss实现PQ编码和解码.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 006-faiss实现PQ编码和解码.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 |
10 |
11 | if __name__ == '__main__':
12 |     # Data
13 | n_data, d = 2000, 512
14 | np.random.seed(43)
15 | data = []
16 | mu, sigma = 3, 0.1
17 | for i in range(n_data):
18 | data.append(np.random.normal(mu, sigma, d))
19 | data = np.array(data).astype('float32')
20 |
21 |     cs = 4  # number of sub-quantizers; at 8 bits each, the code size is 4 bytes
22 |     # Training data
23 |     x = data  # the original dataset
24 |
25 |     x_train = data  # training set
26 |     pq = faiss.ProductQuantizer(d, cs, 8)
27 |     pq.train(x_train)
28 |
29 |     # encode
30 |     codes = pq.compute_codes(x)
31 |
32 |     # decode
33 |     x2 = pq.decode(codes)
34 |
35 |     # relative squared error between the original data and its encode-decode reconstruction
36 | avg_relative_error = ((x - x2)**2).sum() / (x ** 2).sum()
37 | print(avg_relative_error)
--------------------------------------------------------------------------------
/Faiss的使用/007-faiss实现标量量化器.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 007-faiss实现标量量化器.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 |
10 |
11 | if __name__ == '__main__':
12 |     # Data
13 | n_data, d = 2000, 512
14 | np.random.seed(43)
15 | data = []
16 | mu, sigma = 3, 0.1
17 | for i in range(n_data):
18 | data.append(np.random.normal(mu, sigma, d))
19 | data = np.array(data).astype('float32')
20 |
21 |     x = data
22 |     # Training set
23 |     x_train = data
24 |     # QT_8bit allocates 8 bits per dimension (QT_4bit also works)
25 |     sq = faiss.ScalarQuantizer(d, faiss.ScalarQuantizer.QT_8bit)
26 |     sq.train(x_train)
27 |
28 |     # encode
29 |     codes = sq.compute_codes(x)
30 |
31 |     # decode
32 |     x2 = sq.decode(codes)
33 |
34 |     # relative squared error between the original data and its encode-decode reconstruction
35 | avg_relative_error = ((x - x2)**2).sum() / (x ** 2).sum()
36 | print(avg_relative_error)
--------------------------------------------------------------------------------
/Faiss的使用/008-faiss_use_gpu.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 008-faiss_use_gpu.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-25
6 | """
7 | import faiss
8 | import numpy as np
9 | import time
10 |
11 |
12 |
13 | if __name__ == '__main__':
14 |     d = 512  # vector dimension
15 |     nb = 300000  # size of the vector database
16 |     nq = 100  # number of query vectors
17 |
18 |     np.random.seed(1234)
19 |
20 |     # Randomly generate the vector database
21 |     xb = np.random.random((nb, d)).astype('float32')
22 |     xb[:, 0] += np.arange(nb) / 1000.
23 |
24 |     # Randomly generate 100 query vectors
25 |     xq = np.random.random((nq, d)).astype('float32')
26 |     xq[:, 0] += np.arange(nq) / 1000.
27 |
28 |     quantizer = faiss.IndexFlatL2(d)
29 |     nlist = 100  # number of Voronoi cells to partition the database vectors into
30 |     index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
31 |
32 |     gpu_index = faiss.index_cpu_to_all_gpus(index)  # this single line moves the index onto all available GPUs
33 |     print(gpu_index.is_trained)
34 |     gpu_index.train(xb)
35 |     print(gpu_index.is_trained)
36 |
37 |     gpu_index.add(xb)
38 |     gpu_index.nprobe = 10  # search the 10 nearest Voronoi cells
39 |     k = 10  # return ten results
40 | D, gt_nms = gpu_index.search(xq, k)
41 | print(gt_nms)
--------------------------------------------------------------------------------
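
index_cpu_to_all_gpus shards the index across every visible GPU. To pin the index to a single device instead, faiss also provides StandardGpuResources together with index_cpu_to_gpu. A minimal sketch (reusing the index built above; device 0 is an assumption):

    res = faiss.StandardGpuResources()                  # allocate resources for one GPU
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index)   # move the index to device 0

The rest of the train/add/search flow is identical to the multi-GPU version.
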
/LAC分词器/001-分词.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/3 14:06
4 | @Auth : xiaolu
5 | @File :001-分词.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from LAC import LAC
10 | import jieba
11 |
12 |
13 | if __name__ == '__main__':
14 | lac = LAC(mode='seg')
15 |
16 |     # Single-sample input: a unicode string
17 | text = '大王叫我来巡山'
18 | lac_result = lac.run(text)
19 | print(lac_result)
20 |
21 | jieba_result = jieba.lcut(text)
22 | print(jieba_result)
23 |
24 |     # Batch input: a list of sentences; average throughput is higher this way
25 | texts = ["山里有个庙", "庙里有个老和尚跟一个小和尚"]
26 | result = lac.run(texts)
27 | print(result)
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/LAC分词器/002-词性标注和实体识别.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/3 14:09
4 | @Auth : xiaolu
5 | @File :002-词性标注和实体识别.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from LAC import LAC
10 |
11 |
12 | if __name__ == '__main__':
13 | lac = LAC(mode='lac')
14 | text = '我想涨工资'
15 |
16 | lac_result = lac.run(text)
17 | print(lac_result)
18 |
19 | texts = ["汤青松长得好帅", "我喜欢做安全开发工程师"]
20 | lac_result = lac.run(texts)
21 | print(lac_result)
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/LAC分词器/003-加载自己的词表进行分词.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/6 11:18
4 | @Auth : xiaolu
5 | @File :003-加载自己的词表进行分词.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from LAC import LAC
10 | import jieba
11 |
12 | if __name__ == '__main__':
13 | lac = LAC()
14 | lac.load_customization('./vocab.txt', sep=None)
15 | res1 = lac.run('字节跳动阿里巴巴腾讯公司金山软件小米科技')
16 | res2 = jieba.lcut('字节跳动阿里巴巴腾讯公司金山软件小米科技')
17 | print(res1)
18 | print(res2)
19 |
20 |
21 |
--------------------------------------------------------------------------------
/LAC分词器/vocab.txt:
--------------------------------------------------------------------------------
1 | 我
2 | 爱你
3 | 我爱
--------------------------------------------------------------------------------
/PySpark/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/PySpark/.DS_Store
--------------------------------------------------------------------------------
/PySpark/001-data_processing_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-data_processing_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 | from pyspark.sql import SparkSession
10 | from pyspark.sql.types import StringType, DoubleType, IntegerType
11 | from pyspark.sql.functions import udf
12 | from pyspark.sql.functions import pandas_udf, PandasUDFType
13 |
14 |
15 | def price_range(brand):
16 | if brand in ['Samsung', 'Apple']:
17 | return 'High Price'
18 | elif brand == 'MI':
19 | return 'Mid Price'
20 | else:
21 | return 'Low Price'
22 |
23 |
24 | def remaining_yrs(age):
25 | yrs_left = 100-age
26 | return yrs_left
27 |
28 |
29 | if __name__ == '__main__':
30 |     # 1. Create the Spark session
31 |     spark = SparkSession.builder.appName('data_processing').getOrCreate()
32 |
33 |     # 2. Load the data
34 |     df = spark.read.csv('./data/sample_data.csv', inferSchema=True, header=True)
35 |     print(df.columns)  # print all column names: ['ratings', 'age', 'experience', 'family', 'mobile']
36 |     print(df.count())  # total number of rows: 33
37 |
38 |     # Print the schema
39 |     print(df.printSchema())
40 |
41 |     # Show the first five rows
42 |     print(df.show(n=5))
43 |
44 |     # Show the first three rows of two selected columns
45 |     print(df.select('ratings', 'mobile').show(n=3))
46 |
47 |     # Show summary statistics, i.e. the mean, standard deviation, etc. of every column
48 |     print(df.describe().show())
49 |
50 |     # Create a new column
51 |     print(df.withColumn("age_after_10_yrs", (df["age"]+10)).show(5))
52 |
53 |     # Cast a column to another type and store the result as a new column
54 |     print(df.withColumn('age_double', df['age'].cast(DoubleType())).show(3, False))
55 |
56 |     # Filter: keep only the rows where a column takes a given value
57 |     print(df.filter(df['mobile'] == 'Vivo').select('age', 'ratings', 'mobile').show())
58 |
59 |     # Filter on multiple conditions
60 |     print(df.filter((df['mobile'] == 'Vivo') & (df['experience'] > 10)).show())
61 |
62 |     # Show the distinct values of a column
63 |     print(df.select('mobile').distinct().show())
64 |     print('number of distinct values:', df.select('mobile').distinct().count())
65 |
66 |     # Group rows by the values of a column
67 |     print(df.groupBy('mobile').count().show())  # count per group
68 |     print(df.groupBy('mobile').mean().show())  # per-group mean of every column
69 |     print(df.groupBy('mobile').sum().show())  # per-group sum of every column
70 |     print(df.groupBy('mobile').agg({'experience': 'sum'}).show())  # per-group sum of the experience column only
71 |     print(df.groupBy('mobile').max().show())  # per-group maximum of every column
72 |     print(df.groupBy('mobile').min().show())  # per-group minimum of every column
73 |
74 |     # Plain UDF
75 |     # User-defined functions (UDFs)
76 |     brand_udf = udf(price_range, StringType())  # two arguments: the user-defined function and its return type
77 |     print(df.withColumn('price_range', brand_udf(df['mobile'])).show())  # apply the udf to the mobile column
78 |
79 |     # Or use a lambda expression
80 |     age_udf = udf(lambda age: "young" if age <= 30 else "senior", StringType())
81 |     print(df.withColumn("age_group", age_udf(df.age)).show())
82 |
83 |     # Drop duplicate rows
84 |     print(df.count())
85 |     df = df.dropDuplicates()
86 |     print('row count after dropping duplicates:', df.count())
87 |
88 |     # Drop a column
89 |     df_new = df.drop('mobile')
90 |     print(df_new.show(5))
--------------------------------------------------------------------------------
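
The script imports pandas_udf but never uses it. For column-wise transformations, a vectorized pandas UDF is usually much faster than the row-at-a-time udf shown above, because it operates on whole pandas Series batches (this requires pyarrow to be installed). A sketch in the Spark >= 3.0 type-hint style, reimplementing the remaining_yrs helper from the script; remaining_yrs_pd is a hypothetical name, and this is untested against this exact setup:

    import pandas as pd
    from pyspark.sql.functions import pandas_udf
    from pyspark.sql.types import IntegerType

    @pandas_udf(IntegerType())
    def remaining_yrs_pd(age: pd.Series) -> pd.Series:
        return 100 - age  # operates on a whole batch of rows at once

    df.withColumn('yrs_left', remaining_yrs_pd(df['age'])).show(5)
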
/PySpark/002-linear_regression_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-linear_regression_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 |
10 | from pyspark.sql import SparkSession
11 | from pyspark.sql.functions import corr
12 | from pyspark.ml.linalg import Vector
13 | from pyspark.ml.feature import VectorAssembler
14 | from pyspark.ml.regression import LinearRegression
15 |
16 |
17 | def analyse_data(df):
18 |     '''
19 |     Data analysis
20 |     :param df:
21 |     :return:
22 |     '''
23 |     # Print the schema
24 |     print(df.printSchema())
25 |
26 |     # Show the first ten rows
27 |     print(df.head(10))
28 |
29 |     # Correlation between one feature and the target: var_1 vs output
30 |     print(df.select(corr('var_1', 'output')).show())  # 0.9187399607627283
31 |
32 |
33 | def feature_process(df):
34 |     '''
35 |     Feature engineering
36 |     :param df:
37 |     :return:
38 |     '''
39 |     # Assemble var_1 through var_5 into a single vector column named features
40 |     vec_assembler = VectorAssembler(inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'], outputCol='features')
41 |     features_df = vec_assembler.transform(df)
42 |     # print(features_df.select('features').show(5))  # inspect the features column
43 |
44 |     model_df = features_df.select('features', 'output')  # keep features and the target for model training
45 |     # print(model_df.show(5))
46 |     return model_df
47 |
48 |
49 | if __name__ == '__main__':
50 |     # 1. Load the dataset
51 |     spark = SparkSession.builder.appName('lin_reg').getOrCreate()
52 |     df = spark.read.csv('./data/Linear_regression_dataset.csv', inferSchema=True, header=True)
53 |     # print('rows: {}, columns: {}'.format(df.count(), len(df.columns)))  # rows: 1232, columns: 6
54 |
55 |     # 2. Data analysis
56 |     # analyse_data(df)  # uncomment to explore the data
57 |
58 |     # 3. Feature engineering
59 |     model_df = feature_process(df)  # merge the feature values into one vector
60 |     # Split the data
61 |     train_df, test_df = model_df.randomSplit([0.7, 0.3])
62 |     # print('train --- rows: {}, columns: {}'.format(train_df.count(), len(train_df.columns)))  # rows: 868, columns: 2
63 |     # print('test --- rows: {}, columns: {}'.format(test_df.count(), len(test_df.columns)))  # rows: 364, columns: 2
64 |
65 |     # 4. Model training
66 |     lin_Reg = LinearRegression(labelCol='output')
67 |     lr_model = lin_Reg.fit(train_df)
68 |
69 |     # 5. Model evaluation
70 |     # Training is done; print the regression coefficients
71 |     print(lr_model.coefficients)
72 |
73 |     training_predictions = lr_model.evaluate(train_df)
74 |     print('train MSE:', training_predictions.meanSquaredError)
75 |     # train MSE: 0.00014265219879599827
76 |
77 |     testing_predictions = lr_model.evaluate(test_df)
78 |     print('test MSE:', testing_predictions.meanSquaredError)
79 |     # test MSE: 0.00014983739298532136
--------------------------------------------------------------------------------
/PySpark/003-logistic_regression_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-logistic_regression_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 |
10 | from pyspark.sql import SparkSession
11 | from pyspark.ml.feature import StringIndexer
12 | from pyspark.ml.feature import VectorAssembler
13 | from pyspark.ml.feature import OneHotEncoder
14 | from pyspark.ml.classification import LogisticRegression
15 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
16 |
17 |
18 | def analyse_data(df):
19 |     '''
20 |     Data analysis
21 |     :param df:
22 |     :return:
23 |     '''
24 |     # Print the schema
25 |     print(df.printSchema())
26 |
27 |     # Show the first five rows
28 |     print(df.show(n=5))
29 |
30 |     # A quick look at the summary statistics of each feature
31 |     print(df.describe().show())  # mean and stddev are not computed for categorical columns
32 |
33 |     # Group by country to see which country has the most samples
34 |     print(df.groupby('Country').count().show())
35 |
36 |     # See which platform (search engine) has the most users
37 |     print(df.groupby('Platform').count().show())
38 |
39 |
40 | def feature_process(df):
41 |     '''
42 |     Feature engineering
43 |     :param df:
44 |     :return:
45 |     '''
46 |     # The country and platform features must be converted to numeric features
47 |     search_engine_indexer = StringIndexer(inputCol="Platform", outputCol='Platform_Num').fit(df)
48 |     df = search_engine_indexer.transform(df)
49 |     # print(df.show(3))
50 |     search_engine_encoder = OneHotEncoder(inputCol='Platform_Num', outputCol='Platform_Num_Vec').fit(df)
51 |     df = search_engine_encoder.transform(df)
52 |     # print(df.show(3))
53 |
54 |     # print('*'*150)
55 |     # Then handle the country feature
56 |     country_indexer = StringIndexer(inputCol="Country", outputCol='Country_Num').fit(df)
57 |     df = country_indexer.transform(df)
58 |     # print(df.show(3))
59 |     country_encoder = OneHotEncoder(inputCol='Country_Num', outputCol='Country_Num_Vec').fit(df)
60 |     df = country_encoder.transform(df)
61 |     # print(df.show(3))
62 |
63 |     df_assembler = VectorAssembler(
64 |         inputCols=['Platform_Num_Vec', 'Country_Num_Vec', 'Age', 'Repeat_Visitor', 'Web_pages_viewed'],
65 |         outputCol='features'
66 |     )
67 |     df = df_assembler.transform(df)
68 |     model_df = df.select(['features', 'Status'])
69 |     return model_df
70 |
71 |
72 | if __name__ == "__main__":
73 |     # 1. Load the data
74 |     spark = SparkSession.builder.appName('log_reg').getOrCreate()
75 |     df = spark.read.csv('./data/Log_Reg_dataset.csv', inferSchema=True, header=True)
76 |     # print('rows: {}, columns: {}'.format(df.count(), len(df.columns)))  # rows: 20000, columns: 6
77 |
78 |     # 2. Data analysis
79 |     # analyse_data(df)
80 |
81 |     # 3. Feature engineering
82 |     model_df = feature_process(df)
83 |     # print(model_df.show(3))
84 |     # Split the dataset
85 |     training_df, test_df = model_df.randomSplit([0.75, 0.25])
86 |     print('training set size:', training_df.count())
87 |     print('test set size:', test_df.count())
88 |
89 |     print('label distribution in the training set:')
90 |     print(training_df.groupBy('Status').count().show())
91 |
92 |     print('label distribution in the test set:')
93 |     print(test_df.groupBy('Status').count().show())
94 |
95 |     # 4. Train the model
96 |     log_reg = LogisticRegression(labelCol='Status').fit(training_df)
97 |
98 |     # 5. Evaluate the model
99 |     train_results = log_reg.evaluate(training_df).predictions
100 |     correct_preds = train_results.filter(train_results['Status'] == 1).filter(train_results['prediction'] == 1).count()
101 |     print('recall on the training set:', float(correct_preds)/(training_df.filter(training_df['Status'] == 1).count()))
102 |
103 |     # Performance on the test set
104 |     results = log_reg.evaluate(test_df).predictions
105 |     # Build the confusion matrix
106 |     true_positives = results[(results.Status == 1) & (results.prediction == 1)].count()
107 |     true_negatives = results[(results.Status == 0) & (results.prediction == 0)].count()
108 |     false_positives = results[(results.Status == 0) & (results.prediction == 1)].count()
109 |     false_negatives = results[(results.Status == 1) & (results.prediction == 0)].count()
110 |     recall = float(true_positives)/(true_positives + false_negatives)
111 |     print('recall:', recall)
112 |
113 |     precision = float(true_positives) / (true_positives + false_positives)
114 |     print('precision:', precision)
115 |
116 |     accuracy = float((true_positives + true_negatives) / results.count())
117 |     print('accuracy:', accuracy)
--------------------------------------------------------------------------------
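
BinaryClassificationEvaluator is imported in this script but never used; the metrics above are computed by hand from the confusion matrix. A short sketch of getting the test AUC with the evaluator instead, reusing the results DataFrame from the script (the evaluator reads the rawPrediction column that LogisticRegression adds by default):

    evaluator = BinaryClassificationEvaluator(labelCol='Status')  # metric defaults to areaUnderROC
    auc = evaluator.evaluate(results)
    print('test AUC:', auc)
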
/PySpark/004-random_forests_classification_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 004-random_forests_classification_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 |
9 | findspark.init()
10 |
11 | from pyspark.ml.feature import VectorAssembler
12 | from pyspark.sql import SparkSession
13 | from pyspark.ml.classification import RandomForestClassifier
14 | from pyspark.ml.classification import RandomForestClassificationModel
15 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
16 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator
17 |
18 |
19 | def analyse_data(df):
20 |     '''
21 |     Data analysis
22 |     :param df:
23 |     :return:
24 |     '''
25 |     print(df.show(5))
26 |
27 |     # Summary statistics (mean, stddev, ...) of each feature
28 |     print(df.describe().select('summary', 'rate_marriage', 'age', 'yrs_married', 'children', 'religious').show())
29 |
30 |     # Distribution of marriage ratings
31 |     print(df.groupBy('rate_marriage').count().show())
32 |
33 |     # Group by children and affairs: people with no children and no affairs form the largest group
34 |     print(df.groupBy('children', 'affairs').count().orderBy('children', 'affairs', 'count', ascending=True).show())
35 |
36 |
37 | def feature_process(df):
38 |     '''
39 |     Feature engineering
40 |     :param df:
41 |     :return:
42 |     '''
43 |     df_assembler = VectorAssembler(inputCols=['rate_marriage', 'age', 'yrs_married', 'children', 'religious'],
44 |                                    outputCol="features")
45 |     df = df_assembler.transform(df)
46 |     model_df = df.select(['features', 'affairs'])
47 |     return model_df
48 |
49 |
50 | if __name__ == '__main__':
51 |     # 1. Load the dataset
52 |     spark = SparkSession.builder.appName('random_forest').getOrCreate()
53 |     df = spark.read.csv('./data/affairs.csv', inferSchema=True, header=True)
54 |     print((df.count(), len(df.columns)))
55 |
56 |     # 2. Data analysis
57 |     analyse_data(df)
58 |
59 |     # 3. Feature engineering
60 |     model_df = feature_process(df)
61 |     # Split the dataset
62 |     train_df, test_df = model_df.randomSplit([0.75, 0.25])
63 |     print('training set size:', train_df.count())
64 |     print('training label counts:')
65 |     print(train_df.groupBy('affairs').count().show())
66 |
67 |     print('test set size:', test_df.count())
68 |     print('test label counts:')
69 |     print(test_df.groupBy('affairs').count().show())
70 |
71 |     # 4. Train the model
72 |     rf_classifier = RandomForestClassifier(labelCol='affairs', numTrees=50).fit(train_df)
73 |
74 |     # 5. Evaluate the model
75 |     rf_predictions = rf_classifier.transform(test_df)
76 |
77 |     rf_accuracy = MulticlassClassificationEvaluator(labelCol='affairs', metricName='accuracy').evaluate(rf_predictions)
78 |     print('test accuracy:', rf_accuracy)
79 |
80 |     rf_precision = MulticlassClassificationEvaluator(labelCol='affairs', metricName='weightedPrecision').evaluate(
81 |         rf_predictions)
82 |     print('test weighted precision:', rf_precision)
83 |
84 |     rf_auc = BinaryClassificationEvaluator(labelCol='affairs').evaluate(rf_predictions)
85 |     print('test AUC:', rf_auc)
86 |
87 |     # Feature importances learned by the classifier
88 |     print(rf_classifier.featureImportances)
89 |
90 |     # Save the model
91 |     rf_classifier.save("./RF_model")
92 |
93 |     # To reuse it later, load it like this
94 |     rf = RandomForestClassificationModel.load("./RF_model")
95 |     model_predictions = rf.transform(test_df)
96 |     model_predictions.show()
97 |
98 |
--------------------------------------------------------------------------------
/PySpark/005-kmeans_cluster_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 005-kmeans_cluster_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 | import pyspark
10 | import pandas as pd
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 | from pyspark.sql.functions import *
14 | from pyspark.sql.types import *
15 | from pyspark.sql.functions import rand, randn
16 | from pyspark.ml.clustering import KMeans
17 | from pyspark.sql import SparkSession
18 | from pyspark.ml.linalg import Vectors
19 | from pyspark.ml.feature import VectorAssembler
20 | from pyspark.ml.evaluation import ClusteringEvaluator
21 |
22 | def analyse_data(df):
23 |     '''
24 |     Data analysis
25 |     :param df:
26 |     :return:
27 |     '''
28 |     print('number of distinct labels:', df.select('species').distinct().count())
29 |
30 |     # Sample count per class
31 |     print(df.groupBy('species').count().orderBy('count', ascending=False).show())
32 |
33 |
34 | def feature_process(df):
35 |     '''
36 |     Feature engineering
37 |     :param df:
38 |     :return:
39 |     '''
40 |     input_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
41 |     vec_assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
42 |     final_data = vec_assembler.transform(df)
43 |     return final_data
44 |
45 |
46 | if __name__ == '__main__':
47 |     # Load the iris dataset
48 |     spark = SparkSession.builder.appName('k_means').getOrCreate()
49 |     df = spark.read.csv('./data/iris_dataset.csv', inferSchema=True, header=True)
50 |     print((df.count(), len(df.columns)))
51 |
52 |     analyse_data(df)
53 |
54 |     final_data = feature_process(df)
55 |
56 |     errors = []
57 |
58 |     for k in range(2, 10):
59 |         kmeans = KMeans(featuresCol='features', k=k)
60 |         model = kmeans.fit(final_data)
61 |
62 |         # Make predictions
63 |         predictions = model.transform(final_data)
64 |         evaluator = ClusteringEvaluator()
65 |         silhouette = evaluator.evaluate(predictions)  # silhouette score (squared Euclidean distance by default)
66 |         errors.append(silhouette)  # collect the score for each k (the list was otherwise unused)
67 |         # Print the cluster centers
68 |         centers = model.clusterCenters()
69 |         print("Cluster Centers: ")
70 |         for center in centers:
71 |             print(center)
72 |
73 |     print('silhouette score per k:', errors)
74 |
75 |
76 |
--------------------------------------------------------------------------------
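
The loop above now collects one silhouette score per k in errors, but matplotlib (already imported in the script) is never used. A sketch of the usual plot for picking k from those scores, appended after the loop:

    plt.plot(range(2, 10), errors, marker='o')
    plt.xlabel('k')
    plt.ylabel('silhouette score')
    plt.savefig('silhouette_vs_k.png')  # higher is better; look for the peak
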
/PySpark/006-recommendr_system_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 006-recommendr_system_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-09
6 | """
7 | import findspark
8 |
9 | findspark.init()
10 |
11 | from pyspark.sql import SparkSession
12 | from pyspark.sql.functions import rand
13 | from pyspark.ml.feature import StringIndexer, IndexToString
14 | from pyspark.ml.recommendation import ALS
15 | from pyspark.ml.evaluation import RegressionEvaluator
16 |
17 |
18 | def analyse_data(df):
19 |     '''
20 |     Data analysis
21 |     :param df:
22 |     :return:
23 |     '''
24 |     print(df.printSchema())  # inspect the schema
25 |
26 |     # Show the first 5 rows
27 |     print(df.show(5))
28 |
29 |     print(df.orderBy(rand()).show(5))  # shuffle the data and show the first five rows
30 |
31 |     # Group by user to see how many movies each user rated: the five most active users
32 |     print(df.groupBy('userId').count().orderBy('count', ascending=False).show(5))
33 |
34 |     # Show the five most-rated movies
35 |     print(df.groupBy('title').count().orderBy('count', ascending=False).show(5))
36 |
37 |
38 | def feature_process(df):
39 |     '''
40 |     Feature engineering
41 |     :param df:
42 |     :return:
43 |     '''
44 |     # 1. Convert title to a numeric index, i.e. add one more column
45 |     stringIndexer = StringIndexer(inputCol="title", outputCol="title_new")
46 |     model = stringIndexer.fit(df)
47 |     indexed = model.transform(df)
48 |     print(indexed.show(5))
49 |     return indexed
50 |
51 |
52 | if __name__ == '__main__':
53 |     # 1. Load the data
54 |     spark = SparkSession.builder.appName('rc').getOrCreate()
55 |     df = spark.read.csv('./data/movie_ratings_df.csv', inferSchema=True, header=True)
56 |     # print((df.count(), len(df.columns)))  # (100000, 3)
57 |
58 |     # 2. Data analysis
59 |     analyse_data(df)
60 |
61 |     # 3. Feature engineering
62 |     model_df = feature_process(df)
63 |     # Split the dataset
64 |     train, test = model_df.randomSplit([0.75, 0.25])
65 |     print('training set size:', train.count())
66 |     print('test set size:', test.count())
67 |     # training set size: 74996
68 |     # test set size: 25004
69 |
70 |     # 4. Model training
71 |     rec = ALS(maxIter=10, regParam=0.01, userCol='userId',
72 |               itemCol='title_new', ratingCol='rating',
73 |               nonnegative=True, coldStartStrategy="drop")
74 |     rec_model = rec.fit(train)
75 |
76 |     # 5. Model evaluation
77 |     predicted_ratings = rec_model.transform(test)
78 |     print(predicted_ratings.printSchema())
79 |
80 |     # Compute the RMSE between the predictions and the true ratings
81 |     evaluator = RegressionEvaluator(metricName='rmse', predictionCol='prediction', labelCol='rating')
82 |     rmse = evaluator.evaluate(predicted_ratings)
83 |     print(rmse)
84 |
85 |
--------------------------------------------------------------------------------
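
Once the ALS model is fitted, Spark can also generate top-N recommendations directly. A sketch using recommendForAllUsers on the rec_model from the script; note the results contain title_new indices, and mapping them back to titles would need the fitted StringIndexer model, which feature_process above does not return (IndexToString is imported for exactly that purpose but left unused):

    user_recs = rec_model.recommendForAllUsers(5)  # top-5 (title_new, predicted rating) pairs per user
    user_recs.show(5, truncate=False)
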
/PySpark/007-NLP_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 007-NLP_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-09
6 | """
7 | import findspark
8 |
9 | findspark.init()
10 |
11 | from pyspark.sql import SparkSession
12 | from pyspark.ml.feature import Tokenizer
13 | from pyspark.ml.feature import StopWordsRemover
14 | from pyspark.ml.feature import CountVectorizer
15 | from pyspark.ml.feature import HashingTF, IDF
16 | from pyspark.sql.functions import length
17 | from pyspark.sql.functions import udf
18 | from pyspark.sql.types import IntegerType
19 | from pyspark.sql.functions import *
20 | from pyspark.ml.feature import VectorAssembler
21 | from pyspark.ml.classification import LogisticRegression
22 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
23 |
24 |
25 | def basic_op():
26 |     '''
27 |     Basic operations
28 |     :return:
29 |     '''
30 |     spark = SparkSession.builder.appName('nlp').getOrCreate()
31 |     df = spark.createDataFrame([(1, 'I really liked this movie'),
32 |                                 (2, 'I would recommend this movie to my friends'),
33 |                                 (3, 'movie was alright but acting was horrible'),
34 |                                 (4, 'I am never watching that movie ever again')],
35 |                                ['user_id', 'review'])
36 |     # print(df.show())
37 |
38 |     # 1. Tokenize the text into a new column
39 |     tokenization = Tokenizer(inputCol='review', outputCol='tokens')
40 |     tokenized_df = tokenization.transform(df)
41 |     # print(tokenized_df.show())
42 |
43 |     # 2. Remove stopwords
44 |     stopword_removal = StopWordsRemover(inputCol='tokens', outputCol='refined_tokens')
45 |     refined_df = stopword_removal.transform(tokenized_df)
46 |     print(refined_df.select(['user_id', 'tokens', 'refined_tokens']).show(10))
47 |
48 |     # 3. Count vectors (bag-of-words)
49 |     count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features')
50 |     cv_df = count_vec.fit(refined_df).transform(refined_df)
51 |     print(cv_df.select(['user_id', 'refined_tokens', 'features']).show(4))
52 |     print('vocabulary (after stopword removal):', count_vec.fit(refined_df).vocabulary)
53 |
54 |     # 4. Compute tf-idf
55 |     hashing_vec = HashingTF(inputCol='refined_tokens', outputCol='tf_features')
56 |     hashing_df = hashing_vec.transform(refined_df)  # first hash the tokens into term-frequency buckets
57 |     print(hashing_df.select(['user_id', 'refined_tokens', 'tf_features']).show())
58 |
59 |     tf_idf_vec = IDF(inputCol='tf_features', outputCol='tf_idf_features')
60 |     tf_idf_df = tf_idf_vec.fit(hashing_df).transform(hashing_df)
61 |     print(tf_idf_df.select(['user_id', 'tf_idf_features']).show(4))
62 |
63 |
64 | def data_process(text_df):
65 |     text_df = text_df.filter(((text_df.Sentiment == '1') | (text_df.Sentiment == '0')))
66 |     print('rows after cleaning:', text_df.count())
67 |
68 |     print('class distribution:')
69 |     print(text_df.groupBy('Sentiment').count().show())
70 |
71 |     # Cast the sentiment to a float Label column
72 |     text_df = text_df.withColumn("Label", text_df.Sentiment.cast('float')).drop('Sentiment')
73 |
74 |     # Add a review-length feature, then tokenize
75 |     text_df = text_df.withColumn('length', length(text_df['Review']))
76 |     tokenization = Tokenizer(inputCol='Review', outputCol='tokens')
77 |     tokenized_df = tokenization.transform(text_df)
78 |
79 |     # Remove stopwords
80 |     stopword_removal = StopWordsRemover(inputCol='tokens', outputCol='refined_tokens')
81 |     refined_text_df = stopword_removal.transform(tokenized_df)
82 |
83 |     len_udf = udf(lambda s: len(s), IntegerType())
84 |     refined_text_df = refined_text_df.withColumn("token_count", len_udf(col('refined_tokens')))
85 |
86 |     count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features')
87 |     cv_text_df = count_vec.fit(refined_text_df).transform(refined_text_df)
88 |
89 |     model_text_df = cv_text_df.select(['features', 'token_count', 'Label'])
90 |     return model_text_df
91 |
92 |
93 | if __name__ == '__main__':
94 |     # basic_op()
95 |
96 |     # A simple text-classification example
97 |     spark = SparkSession.builder.appName('text_classification').getOrCreate()
98 |     text_df = spark.read.csv('./data/Movie_reviews.csv', inferSchema=True, header=True, sep=',')
99 |     print('row count:', text_df.count())  # row count: 7087
100 |
101 |     model_text_df = data_process(text_df)
102 |     df_assembler = VectorAssembler(inputCols=['features', 'token_count'], outputCol='features_vec')
103 |     model_text_df = df_assembler.transform(model_text_df)
104 |
105 |     # Split the dataset
106 |     training_df, test_df = model_text_df.randomSplit([0.75, 0.25])
107 |
108 |     # Train the model
109 |     log_reg = LogisticRegression(featuresCol='features_vec', labelCol='Label').fit(training_df)
110 |
111 |     # Evaluate the model
112 |     results = log_reg.evaluate(test_df).predictions
113 |
114 |     # confusion matrix
115 |     true_positives = results[(results.Label == 1) & (results.prediction == 1)].count()
116 |     true_negatives = results[(results.Label == 0) & (results.prediction == 0)].count()
117 |     false_positives = results[(results.Label == 0) & (results.prediction == 1)].count()
118 |     false_negatives = results[(results.Label == 1) & (results.prediction == 0)].count()
119 |
120 |     recall = float(true_positives) / (true_positives + false_negatives)
121 |     print(recall)
122 |
123 |     precision = float(true_positives) / (true_positives + false_positives)
124 |     print(precision)
125 |
126 |     accuracy = float((true_positives + true_negatives) / results.count())
127 |     print(accuracy)
128 |
--------------------------------------------------------------------------------
/PySpark/data/Movie_reviews.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/PySpark/data/Movie_reviews.csv
--------------------------------------------------------------------------------
/PySpark/data/iris_dataset.csv:
--------------------------------------------------------------------------------
1 | sepal_length,sepal_width,petal_length,petal_width,species
2 | 5.1,3.5,1.4,0.2,setosa
3 | 4.9,3,1.4,0.2,setosa
4 | 4.7,3.2,1.3,0.2,setosa
5 | 4.6,3.1,1.5,0.2,setosa
6 | 5,3.6,1.4,0.2,setosa
7 | 5.4,3.9,1.7,0.4,setosa
8 | 4.6,3.4,1.4,0.3,setosa
9 | 5,3.4,1.5,0.2,setosa
10 | 4.4,2.9,1.4,0.2,setosa
11 | 4.9,3.1,1.5,0.1,setosa
12 | 5.4,3.7,1.5,0.2,setosa
13 | 4.8,3.4,1.6,0.2,setosa
14 | 4.8,3,1.4,0.1,setosa
15 | 4.3,3,1.1,0.1,setosa
16 | 5.8,4,1.2,0.2,setosa
17 | 5.7,4.4,1.5,0.4,setosa
18 | 5.4,3.9,1.3,0.4,setosa
19 | 5.1,3.5,1.4,0.3,setosa
20 | 5.7,3.8,1.7,0.3,setosa
21 | 5.1,3.8,1.5,0.3,setosa
22 | 5.4,3.4,1.7,0.2,setosa
23 | 5.1,3.7,1.5,0.4,setosa
24 | 4.6,3.6,1,0.2,setosa
25 | 5.1,3.3,1.7,0.5,setosa
26 | 4.8,3.4,1.9,0.2,setosa
27 | 5,3,1.6,0.2,setosa
28 | 5,3.4,1.6,0.4,setosa
29 | 5.2,3.5,1.5,0.2,setosa
30 | 5.2,3.4,1.4,0.2,setosa
31 | 4.7,3.2,1.6,0.2,setosa
32 | 4.8,3.1,1.6,0.2,setosa
33 | 5.4,3.4,1.5,0.4,setosa
34 | 5.2,4.1,1.5,0.1,setosa
35 | 5.5,4.2,1.4,0.2,setosa
36 | 4.9,3.1,1.5,0.1,setosa
37 | 5,3.2,1.2,0.2,setosa
38 | 5.5,3.5,1.3,0.2,setosa
39 | 4.9,3.1,1.5,0.1,setosa
40 | 4.4,3,1.3,0.2,setosa
41 | 5.1,3.4,1.5,0.2,setosa
42 | 5,3.5,1.3,0.3,setosa
43 | 4.5,2.3,1.3,0.3,setosa
44 | 4.4,3.2,1.3,0.2,setosa
45 | 5,3.5,1.6,0.6,setosa
46 | 5.1,3.8,1.9,0.4,setosa
47 | 4.8,3,1.4,0.3,setosa
48 | 5.1,3.8,1.6,0.2,setosa
49 | 4.6,3.2,1.4,0.2,setosa
50 | 5.3,3.7,1.5,0.2,setosa
51 | 5,3.3,1.4,0.2,setosa
52 | 7,3.2,4.7,1.4,versicolor
53 | 6.4,3.2,4.5,1.5,versicolor
54 | 6.9,3.1,4.9,1.5,versicolor
55 | 5.5,2.3,4,1.3,versicolor
56 | 6.5,2.8,4.6,1.5,versicolor
57 | 5.7,2.8,4.5,1.3,versicolor
58 | 6.3,3.3,4.7,1.6,versicolor
59 | 4.9,2.4,3.3,1,versicolor
60 | 6.6,2.9,4.6,1.3,versicolor
61 | 5.2,2.7,3.9,1.4,versicolor
62 | 5,2,3.5,1,versicolor
63 | 5.9,3,4.2,1.5,versicolor
64 | 6,2.2,4,1,versicolor
65 | 6.1,2.9,4.7,1.4,versicolor
66 | 5.6,2.9,3.6,1.3,versicolor
67 | 6.7,3.1,4.4,1.4,versicolor
68 | 5.6,3,4.5,1.5,versicolor
69 | 5.8,2.7,4.1,1,versicolor
70 | 6.2,2.2,4.5,1.5,versicolor
71 | 5.6,2.5,3.9,1.1,versicolor
72 | 5.9,3.2,4.8,1.8,versicolor
73 | 6.1,2.8,4,1.3,versicolor
74 | 6.3,2.5,4.9,1.5,versicolor
75 | 6.1,2.8,4.7,1.2,versicolor
76 | 6.4,2.9,4.3,1.3,versicolor
77 | 6.6,3,4.4,1.4,versicolor
78 | 6.8,2.8,4.8,1.4,versicolor
79 | 6.7,3,5,1.7,versicolor
80 | 6,2.9,4.5,1.5,versicolor
81 | 5.7,2.6,3.5,1,versicolor
82 | 5.5,2.4,3.8,1.1,versicolor
83 | 5.5,2.4,3.7,1,versicolor
84 | 5.8,2.7,3.9,1.2,versicolor
85 | 6,2.7,5.1,1.6,versicolor
86 | 5.4,3,4.5,1.5,versicolor
87 | 6,3.4,4.5,1.6,versicolor
88 | 6.7,3.1,4.7,1.5,versicolor
89 | 6.3,2.3,4.4,1.3,versicolor
90 | 5.6,3,4.1,1.3,versicolor
91 | 5.5,2.5,4,1.3,versicolor
92 | 5.5,2.6,4.4,1.2,versicolor
93 | 6.1,3,4.6,1.4,versicolor
94 | 5.8,2.6,4,1.2,versicolor
95 | 5,2.3,3.3,1,versicolor
96 | 5.6,2.7,4.2,1.3,versicolor
97 | 5.7,3,4.2,1.2,versicolor
98 | 5.7,2.9,4.2,1.3,versicolor
99 | 6.2,2.9,4.3,1.3,versicolor
100 | 5.1,2.5,3,1.1,versicolor
101 | 5.7,2.8,4.1,1.3,versicolor
102 | 6.3,3.3,6,2.5,virginica
103 | 5.8,2.7,5.1,1.9,virginica
104 | 7.1,3,5.9,2.1,virginica
105 | 6.3,2.9,5.6,1.8,virginica
106 | 6.5,3,5.8,2.2,virginica
107 | 7.6,3,6.6,2.1,virginica
108 | 4.9,2.5,4.5,1.7,virginica
109 | 7.3,2.9,6.3,1.8,virginica
110 | 6.7,2.5,5.8,1.8,virginica
111 | 7.2,3.6,6.1,2.5,virginica
112 | 6.5,3.2,5.1,2,virginica
113 | 6.4,2.7,5.3,1.9,virginica
114 | 6.8,3,5.5,2.1,virginica
115 | 5.7,2.5,5,2,virginica
116 | 5.8,2.8,5.1,2.4,virginica
117 | 6.4,3.2,5.3,2.3,virginica
118 | 6.5,3,5.5,1.8,virginica
119 | 7.7,3.8,6.7,2.2,virginica
120 | 7.7,2.6,6.9,2.3,virginica
121 | 6,2.2,5,1.5,virginica
122 | 6.9,3.2,5.7,2.3,virginica
123 | 5.6,2.8,4.9,2,virginica
124 | 7.7,2.8,6.7,2,virginica
125 | 6.3,2.7,4.9,1.8,virginica
126 | 6.7,3.3,5.7,2.1,virginica
127 | 7.2,3.2,6,1.8,virginica
128 | 6.2,2.8,4.8,1.8,virginica
129 | 6.1,3,4.9,1.8,virginica
130 | 6.4,2.8,5.6,2.1,virginica
131 | 7.2,3,5.8,1.6,virginica
132 | 7.4,2.8,6.1,1.9,virginica
133 | 7.9,3.8,6.4,2,virginica
134 | 6.4,2.8,5.6,2.2,virginica
135 | 6.3,2.8,5.1,1.5,virginica
136 | 6.1,2.6,5.6,1.4,virginica
137 | 7.7,3,6.1,2.3,virginica
138 | 6.3,3.4,5.6,2.4,virginica
139 | 6.4,3.1,5.5,1.8,virginica
140 | 6,3,4.8,1.8,virginica
141 | 6.9,3.1,5.4,2.1,virginica
142 | 6.7,3.1,5.6,2.4,virginica
143 | 6.9,3.1,5.1,2.3,virginica
144 | 5.8,2.7,5.1,1.9,virginica
145 | 6.8,3.2,5.9,2.3,virginica
146 | 6.7,3.3,5.7,2.5,virginica
147 | 6.7,3,5.2,2.3,virginica
148 | 6.3,2.5,5,1.9,virginica
149 | 6.5,3,5.2,2,virginica
150 | 6.2,3.4,5.4,2.3,virginica
151 | 5.9,3,5.1,1.8,virginica
--------------------------------------------------------------------------------
/PySpark/data/movie_ratings_df.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/PySpark/data/movie_ratings_df.csv
--------------------------------------------------------------------------------
/PySpark/data/sample_data.csv:
--------------------------------------------------------------------------------
1 | ratings,age,experience,family,mobile
2 | 3,32,9,3,Vivo
3 | 3,27,13,3,Apple
4 | 4,22,2.5,0,Samsung
5 | 4,37,16.5,4,Apple
6 | 5,27,9,1,MI
7 | 4,27,9,0,Oppo
8 | 5,37,23,5,Vivo
9 | 5,37,23,5,Samsung
10 | 3,22,2.5,0,Apple
11 | 3,27,6,0,MI
12 | 2,27,6,2,Oppo
13 | 5,27,6,2,Samsung
14 | 3,37,16.5,5,Apple
15 | 5,27,6,0,MI
16 | 4,22,6,1,Oppo
17 | 4,37,9,2,Samsung
18 | 4,27,6,1,Apple
19 | 1,37,23,5,MI
20 | 2,42,23,2,Oppo
21 | 4,37,6,0,Vivo
22 | 5,22,2.5,0,Samsung
23 | 3,37,16.5,5,Apple
24 | 3,42,23,5,MI
25 | 2,27,9,2,Samsung
26 | 4,27,6,1,Apple
27 | 5,27,2.5,0,MI
28 | 2,27,6,2,Oppo
29 | 5,37,13,1,Vivo
30 | 2,32,16.5,2,Oppo
31 | 3,27,6,0,MI
32 | 3,27,6,0,MI
33 | 4,22,6,1,Oppo
34 | 4,37,6,0,Vivo
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python-Library-Learning
2 | A collection of short, hands-on examples for a variety of interesting Python libraries.
3 |
--------------------------------------------------------------------------------
/RSA实战/001-rsa生成公私钥并保存.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/7/15 12:00
3 | # @Author : xiaolu
4 | # @FileName: 001-rsa生成公私钥并保存.py
5 | # @Software: PyCharm
6 | import rsa
7 |
8 | pubkey, privkey = rsa.newkeys(1024)  # generate a public/private key pair
9 |
10 | pub = pubkey.save_pkcs1()  # serialize the keys so they can be stored
11 | pri = privkey.save_pkcs1()  # save_pkcs1() is a built-in method; its default format argument is "PEM"
12 |
13 | with open('pubkey.pem', mode='wb') as f, open('privkey.pem', mode='wb') as f1:
14 | f.write(pub)
15 | f1.write(pri)
16 |
--------------------------------------------------------------------------------
/RSA实战/002-公钥加密私钥解密.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/7/15 12:05
3 | # @Author : xiaolu
4 | # @FileName: 002-公钥加密私钥解密.py
5 | # @Software: PyCharm
6 | import rsa
7 |
8 | if __name__ == '__main__':
9 | with open('pubkey.pem', mode='rb') as f, open('privkey.pem', 'rb') as f1:
10 |         # Read the public and private keys from file
11 | pub = f.read()
12 | pri = f1.read()
13 |
14 |     # Restore the original key objects
15 | pubkey = rsa.PublicKey.load_pkcs1(pub)
16 | privkey = rsa.PrivateKey.load_pkcs1(pri)
17 |
18 |     message = '这是一条测试消息'
19 |     info = rsa.encrypt(message.encode('utf8'), pubkey)  # encrypt with the public key
20 |     msg = rsa.decrypt(info, privkey)  # decrypt with the private key
21 | print(msg.decode('utf8'))
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
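
Beyond encrypt/decrypt, the rsa package also supports the mirror-image flow: sign with the private key and verify with the public key. A minimal sketch, assuming pubkey and privkey have been loaded as in the script above:

    message = '需要签名的消息'.encode('utf8')
    signature = rsa.sign(message, privkey, 'SHA-256')  # sign with the private key
    print(rsa.verify(message, signature, pubkey))      # returns 'SHA-256'; raises VerificationError if the message was tampered with
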
/apscheduler实现定时任务/定时任务.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 定时任务.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-10-27
6 | """
7 | import time
8 | from datetime import datetime
9 | from apscheduler.schedulers.blocking import BlockingScheduler
10 |
11 |
12 | def my_job(text):
13 | print('{}'.format(text), time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
14 |
15 |
16 | if __name__ == '__main__':
17 | sched = BlockingScheduler()
18 |     # sched.add_job(my_job, 'interval', days=0, hours=24, minutes=0, seconds=0)  # run every 24 hours
19 |     # sched.add_job(my_job, 'interval', seconds=5, args=['北京时间:'])  # run every 5 seconds with the interval trigger
20 |
21 |     # Run once at a specific point in time
22 |     sched.add_job(my_job, 'date', run_date=datetime(2021, 10, 27, 17, 8, 5), args=['北京时间:'])
23 | sched.start()
--------------------------------------------------------------------------------
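
Besides the interval and date triggers shown above, APScheduler also has a cron trigger for recurring wall-clock schedules. A short sketch using the same my_job and sched (add the job before sched.start(), which blocks):

    # run every weekday at 17:30
    sched.add_job(my_job, 'cron', day_of_week='mon-fri', hour=17, minute=30, args=['北京时间:'])
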
/chinesebert中的pinyin和glyph的处理/MSYH.TTC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/chinesebert中的pinyin和glyph的处理/MSYH.TTC
--------------------------------------------------------------------------------
/chinesebert中的pinyin和glyph的处理/image_test.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : image_test.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-07-22
6 | """
7 | import os
8 | import pygame
9 |
10 | chinese_dir = 'chinese'
11 | if not os.path.exists(chinese_dir):
12 | os.mkdir(chinese_dir)
13 |
14 | pygame.init()
15 |
16 | # 1. Render every common CJK character as an image
17 | # start, end = (0x4E00, 0x9FA5)  # unicode range of common CJK characters
18 | #
19 | # for codepoint in range(int(start), int(end)):
20 | #     word = chr(codepoint)
21 | #     font = pygame.font.Font("MSYH.TTC", 22)  # needs the Microsoft YaHei font file msyh.ttc in the current directory (or copy it from C:\Windows\Fonts)
22 | #     rtext = font.render(word, True, (0, 0, 0), (255, 255, 255))
23 | #     pygame.image.save(rtext, os.path.join(chinese_dir, word + ".png"))
24 |
25 | # 2. Render a single character; to use a different font, just swap in another .TTC file
26 | word = '新'
27 | font = pygame.font.Font("MSYH.TTC", 22)  # needs the Microsoft YaHei font file msyh.ttc in the current directory (or copy it from C:\Windows\Fonts)
28 | rtext = font.render(word, True, (0, 0, 0), (255, 255, 255))
29 | pygame.image.save(rtext, os.path.join(chinese_dir, word + ".png"))
30 |
31 |
--------------------------------------------------------------------------------
/chinesebert中的pinyin和glyph的处理/pinyin_test.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : pinyin_test.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-07-22
6 | """
7 | from pypinyin import pinyin, lazy_pinyin, Style
8 |
9 | if __name__ == '__main__':
10 |     print(pinyin('新浪微博'))  # output: [['xīn'], ['làng'], ['wēi'], ['bó']]
11 |
12 |     print(lazy_pinyin('新浪微博'))  # output: ['xin', 'lang', 'wei', 'bo']
13 |
14 |     # Append the tone as a digit after each syllable
15 |     style = Style.TONE3  # 1, 2, 3 and 4 denote the four Mandarin tones
16 | print(lazy_pinyin('新浪微博', style=style))
17 |
18 |
--------------------------------------------------------------------------------
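
Two more pypinyin options worth knowing: heteronym=True returns every candidate pronunciation of a polyphonic character, and Style.TONE2 places the tone digit right after the vowel instead of at the end of the syllable. A short sketch (expected outputs shown as approximate comments):

    from pypinyin import pinyin, lazy_pinyin, Style

    print(pinyin('中心', heteronym=True))                # 中 is polyphonic, e.g. [['zhōng', 'zhòng'], ['xīn']]
    print(lazy_pinyin('新浪微博', style=Style.TONE2))     # tone digit after the vowel, e.g. ['xi1n', 'la4ng', ...]
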
/collections的用法/001-collections中的namedtuple用法.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/8/3 15:57
3 | # @Author : xiaolu
4 | # @FileName: 001-collections中的namedtuple用法.py
5 | # @Software: PyCharm
6 |
7 | # In short: namedtuple is a convenient shorthand for defining a simple class
8 | from collections import namedtuple
9 |
10 | Point = namedtuple("Point", ['x', 'y'])
11 | # Equivalent to defining a Point class with attributes x and y
12 | p = Point(1, 2)
13 | print(p.x)
14 | print(p.y)
15 |
16 |
17 | # In deep learning, this is a handy way to define a config object
18 | from collections import namedtuple
19 | Config = namedtuple('Config', ['learning_rate',
20 | 'epoch',
21 | 'device',
22 | 'batch_size',
23 | 'vocab_size'])
24 |
25 |
26 | config = Config(
27 | learning_rate=1e-5,
28 | epoch=10,
29 | device=4,
30 | batch_size=32,
31 | vocab_size=12239
32 | )
33 | print(config.learning_rate)
34 |
35 |
36 |
--------------------------------------------------------------------------------
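
namedtuple instances are immutable, which matters when they are used as configs like above. Two built-in helpers cover the common needs: _asdict converts an instance to a dict, and _replace returns a modified copy. A short sketch reusing the Config instance from the file:

    print(config._asdict())                        # e.g. {'learning_rate': 1e-05, 'epoch': 10, ...}
    config2 = config._replace(learning_rate=2e-5)  # immutable, so _replace returns a new instance
    print(config2.learning_rate)
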
/elasticsearch/001-创建库并插入数据.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/7/30 15:04
3 | # @Author : xiaolu
4 | # @FileName: 001-创建库并插入数据.py
5 | # @Software: PyCharm
6 | from elasticsearch import Elasticsearch
7 |
8 |
9 | es = Elasticsearch()
10 |
11 | # result = es.indices.delete(index='point_type', ignore=[400, 404])  # delete the index
12 | # exit()
13 |
14 |
15 | mapping = {
16 | "settings": {
17 | "analysis": {
18 | "filter": {
19 | "jieba_stop": {
20 | "type": "stop",
21 | "stopwords_path": "stopwords/stopwords.txt"
22 | },
23 | "jieba_synonym": {
24 | "type": "synonym",
25 | "synonyms_path": "synonyms/synonyms.txt"
26 | },
27 | "my_shingle_filter": {
28 | "type": "shingle",
29 | "min_shingle_size": 2,
30 | "max_shingle_size": 2,
31 | "output_unigrams": False
32 | }
33 | },
34 |         "analyzer": {
35 |             "word_ana": {
36 |                 "tokenizer": "jieba_search",  # tokenize with jieba
37 |                 "filter": "jieba_stop"  # filter with the jieba stopword list
38 |             },
39 |             "char_ana": {
40 |                 "tokenizer": "standard",  # for characters, use the standard tokenizer, i.e. split per character
41 |                 "filter": "jieba_stop"  # also filter with the jieba stopword list
42 |             },
43 | "char_bigram_ana": {
44 | "type": "custom",
45 | "tokenizer": "standard",
46 | "filter": [
47 | "jieba_stop",
48 | "my_shingle_filter"
49 | ]
50 | },
51 | "word_bigram_ana": {
52 | "type": "custom",
53 | "tokenizer": "jieba_search",
54 | "filter": [
55 | "jieba_stop",
56 | "my_shingle_filter"
57 | ]
58 | }
59 | }
60 | }
61 | },
62 | "mappings": {
63 | "properties": {
64 | "title": {
65 | "type": "keyword"
66 | },
67 | "author": {
68 | "type": "keyword"
69 | },
70 | "dynasty": {
71 | "type": "keyword"
72 | },
73 | "words": {
74 | "type": "integer"
75 | },
76 | "content": {
77 | "analyzer": "word_ana",
78 | "search_analyzer": "word_ana",
79 | "type": "text"
80 | }
81 | }
82 | }
83 | }
84 | # In effect: when a document is indexed, its content field is tokenized and run through the jieba stopword filter; a query against content goes through the same tokenize-and-filter pipeline before matching.
85 |
86 | # es.indices.create(index='point_type', body=mapping)
87 |
88 | # Then insert the documents
89 | data = [
90 | {
91 | "title": "静夜思",
92 | "author": "李白",
93 | "dynasty": "唐",
94 | "words": "20",
95 | "content": "床前明月光,疑是地上霜。举头望明月,低头思故乡。"
96 | },
97 |
98 | {
99 | "title": "观沧海",
100 | "author": "曹操",
101 | "dynasty": "东汉末年",
102 | "words": "56",
103 | "content": "东临碣石,以观沧海。水何澹澹,山岛竦峙。树木丛生,百草丰茂。秋风萧瑟,洪波涌起。日月之行,若出其中。星汉灿烂,若出其里。幸甚至哉,歌以咏志。"
104 | },
105 |
106 | {
107 | "title": "咏鹅",
108 | "author": "骆宾王",
109 | "dynasty": "唐",
110 | "words": "18",
111 | "content": "鹅鹅鹅,曲项向天歌。白毛浮绿水,红掌拨清波。"
112 | },
113 |
114 | {
115 | "title": "将进酒",
116 | "author": "陈陶",
117 | "dynasty": "唐",
118 | "words": "14",
119 | "content": "银鸭金鹅言待谁,隋家岳渎皇家有"
120 | },
121 |
122 | {
123 | "title": "春雪",
124 | "author": "白居易",
125 | "dynasty": "唐",
126 | "words": "10",
127 | "content": "大似落鹅毛,密如飘玉屑"
128 | }
129 | ]
130 | for d in data:
131 | es.index(index='point_type', body=d)
132 |
--------------------------------------------------------------------------------
/elasticsearch/002-es中的搜索.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/7/30 15:52
3 | # @Author : xiaolu
4 | # @FileName: 002-es中的搜索.py
5 | # @Software: PyCharm
6 | from elasticsearch import Elasticsearch
7 |
8 |
9 | if __name__ == '__main__':
10 | es = Elasticsearch()
11 |     querys = '东临碣石'  # not used by the dsl below, which matches on title; see the content-search sketch after this file
12 | dsl = {
13 | 'query': {
14 | 'match': {
15 | 'title': '咏鹅'
16 | }
17 | }
18 | }
19 |     results = es.search(index='point_type', body=dsl)['hits']['hits']  # a search can return multiple hits, so this is a list
20 |
21 | res = []
22 | for result in results:
23 | res.append(result['_source'])
24 | print(res)
25 |
26 |
--------------------------------------------------------------------------------
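
The dsl above matches on the title keyword field, so the querys string is never exercised. A sketch of a full-text query against the content field, which (per the mapping in 001) is analyzed with the jieba tokenizer and stopword filter on both the index and query sides:

    dsl = {
        'query': {
            'match': {
                'content': querys  # analyzed with the same jieba pipeline before matching
            }
        }
    }
    for hit in es.search(index='point_type', body=dsl)['hits']['hits']:
        print(hit['_score'], hit['_source']['title'])
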
/flask+echart+ajax/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/flask+echart+ajax/.DS_Store
--------------------------------------------------------------------------------
/flask+echart+ajax/app.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : app.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2022-01-08
6 | """
7 | import random
8 | from flask import Flask, render_template, jsonify
9 |
10 | app = Flask(__name__)
11 |
12 |
13 | @app.route('/left_data')
14 | def get_left_data():
15 | day = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
16 | # nums = [150, 230, 224, 218, 135, 147, 260]
17 | nums = [random.randint(0, 100) for _ in range(len(day))]
18 | random.shuffle(nums)
19 | data = {'day': day, 'nums': nums}
20 | return jsonify(data)
21 |
22 |
23 | @app.route('/')
24 | def index():
25 | return render_template('index.html')
26 |
27 |
28 | if __name__ == '__main__':
29 | # app.run(port=6000)
30 | app.run(host='0.0.0.0')
--------------------------------------------------------------------------------
/flask+echart+ajax/static/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/flask+echart+ajax/static/.DS_Store
--------------------------------------------------------------------------------
/flask+echart+ajax/static/css/main.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | background: #333;
4 | }
5 |
6 | #left {
7 | position: absolute;
8 | width: 50%;
9 | height: 100%;
10 | top: 0%;
11 | left: 0%;
12 | background: #666666;
13 | /* color: white; */
14 | }
15 |
16 | #right {
17 | position: absolute;
18 | width: 50%;
19 | height: 100%;
20 | top: 0%;
21 | right: 0%;
22 | color: #FFFFFF;
23 | /* font-size: 20px; */
24 | background: green;
25 | }
--------------------------------------------------------------------------------
/flask+echart+ajax/static/js/controller.js:
--------------------------------------------------------------------------------
1 | function get_left_data() {
2 | $.ajax({
3 | url:"/left_data",
4 | success: function(data) {
5 | option_left.xAxis.data = data.day
6 | option_left.series[0].data = data.nums
7 | ec_left.setOption(option_left)
8 | },
9 | error: function(xhr, type, errorThrown) {
10 | }
11 | })
12 | }
13 |
14 | get_left_data()
15 | setInterval(get_left_data, 1000*5)
--------------------------------------------------------------------------------
/flask+echart+ajax/static/js/left.js:
--------------------------------------------------------------------------------
1 | var ec_left = echarts.init(document.getElementById("left"), "dark");
2 |
3 | option_left = {
4 | xAxis: {
5 | type: 'category',
6 | data: []
7 | },
8 | yAxis: {
9 | type: 'value'
10 | },
11 | series: [
12 | {
13 | data: [],
14 | type: 'line'
15 | }
16 | ]
17 | };
18 | ec_left.setOption(option_left);
--------------------------------------------------------------------------------
/flask+echart+ajax/static/js/right.js:
--------------------------------------------------------------------------------
1 | var ec_right = echarts.init(document.getElementById("right"), "dark");
2 |
3 | option_right = {
4 | xAxis: {
5 | type: 'category',
6 | data: ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
7 | },
8 | yAxis: {
9 | type: 'value'
10 | },
11 | series: [
12 | {
13 | data: [150, 230, 224, 218, 135, 147, 260],
14 | type: 'line'
15 | }
16 | ]
17 | };
18 | ec_right.setOption(option_right);
--------------------------------------------------------------------------------
/flask+echart+ajax/templates/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 |     <meta charset="utf-8">
5 |     <title>Title</title>
6 |     <!-- markup reconstructed from main.css and the js files; paths assume Flask's default /static route -->
7 |     <link rel="stylesheet" href="/static/css/main.css">
8 | </head>
9 | <body>
10 |     <div id="left"></div>
11 |     <div id="right"></div>
12 |     <script src="/static/js/jquery.js"></script>
13 |     <script src="/static/js/echarts.min.js"></script>
14 |     <script src="/static/js/left.js"></script>
15 |     <script src="/static/js/right.js"></script>
16 |     <script src="/static/js/controller.js"></script>
17 | </body>
18 | </html>
--------------------------------------------------------------------------------
/flask表单那些事/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/flask表单那些事/.DS_Store
--------------------------------------------------------------------------------
/flask表单那些事/app.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : app.py
4 | # @Time : 2020/11/19 3:54 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | from flask import Flask
10 | from flask import render_template, request
11 |
12 | app = Flask(__name__)
13 |
14 |
15 | @app.route('/', methods=['POST', 'GET'])
16 | def my_index():
17 | user_name = request.form.get('username')
18 | if user_name is not None:
19 | pass_word = request.form.get('pwd')
20 | sex = request.form.getlist('sex')
21 | property = request.form.getlist('property')
22 | content = request.form.get('content')
23 | print(content)
24 | print(user_name)
25 | print(pass_word)
26 | print(sex)
27 | print(property)
28 | return render_template('index.html')
29 |
30 |
31 | if __name__ == '__main__':
32 | app.run()
33 |
34 |
--------------------------------------------------------------------------------
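
A quick way to exercise the handler above without a browser; a sketch using requests (hypothetical field values, app assumed to run on Flask's default port 5000). Note how list values become repeated form fields, which is exactly what request.form.getlist() collects:

import requests

resp = requests.post('http://127.0.0.1:5000/', data={
    'username': 'xiaolu',
    'pwd': '123456',
    'sex': ['男'],                 # repeated keys -> request.form.getlist('sex')
    'property': ['房子', '车子'],  # repeated keys -> request.form.getlist('property')
    'content': 'hello form',
})
print(resp.status_code)
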
/flask表单那些事/templates/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 |     <meta charset="utf-8">
5 |     <title>表单那些事</title>
6 | </head>
7 | <body>
8 |     <!-- form reconstructed from the fields app.py reads; the original markup was stripped from this dump -->
9 |     <form action="/" method="post">
10 |         username: <input type="text" name="username"><br>
11 |         password: <input type="password" name="pwd"><br>
12 |         sex: <input type="checkbox" name="sex" value="男">男
13 |              <input type="checkbox" name="sex" value="女">女<br>
14 |         property: <input type="checkbox" name="property" value="房">房
15 |                   <input type="checkbox" name="property" value="车">车<br>
16 |         content: <textarea name="content"></textarea><br>
17 |         <input type="submit" value="提交">
18 |     </form>
19 | </body>
20 | </html>
--------------------------------------------------------------------------------
/gensim/001-TF-IDF句子相似度计算.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/8/5 11:47
3 | # @Author : xiaolu
4 | # @FileName: 001-TF-IDF句子相似度计算.py
5 | # @Software: PyCharm
6 | import jieba
7 | from gensim import corpora, models, similarities
8 |
9 | import numpy as np
10 | import linecache
11 |
12 |
13 | def similarity(query_path, query):
14 | '''
15 | :param query_path: 问题库的路径
16 | :param query: 所提的问题
17 | :return: 问题库中与当前问题相似的问题索引
18 | '''
19 | # 对问题库中的问题处理
20 | questions = []
21 | with open(query_path, 'r', encoding='utf8') as f:
22 | for line in f.readlines():
23 | line = line.strip()
24 | line = jieba.lcut(line)
25 | temp = []
26 | for w in line:
27 | if w not in stopword:
28 | temp.append(w)
29 | questions.append(temp)
30 |
31 | # 创建词典
32 | dictionary = corpora.Dictionary(questions)
33 | # 基于词典,将分词列表集转换成稀疏向量集,即语料库
34 | questions = [dictionary.doc2bow(ques) for ques in questions]
35 | # 训练TF-IDF模型,传入语料库进行训练
36 | tfidf = models.TfidfModel(questions) # 传入的向量集
37 | # 用训练好的TF-IDF模型处理被检索文本,即语料库
38 | corpus_tfidf = tfidf[questions]
39 | # for temp in corpus_tfidf: # 每个问题中的每个词的tfidf值
40 | # print(temp)
41 | # 对当前所问问题进行处理
42 |
43 | new_vec = dictionary.doc2bow(query.split())
44 | new_vec_tfidf = tfidf[new_vec]
45 |
46 | # 计算当前问题与问题库中所有问题的相似度
47 |     index = similarities.MatrixSimilarity(corpus_tfidf)  # similarity index over the question bank
48 |     sims = index[new_vec_tfidf]  # one similarity score per question in the bank
49 | # print(sims)
50 |
51 | max_loc = np.argmax(sims) # 最相似的问题(问题库)编号
52 | max_sim = sims[max_loc]
53 | # print(max_loc) # 5 相似问题的编号
54 | # print(max_sim) # 1.0 相似程度
55 |
56 | # 句子相似度阈值
57 | sup = 0.7
58 | # row_index默认为-1,即未匹配到满足相似度阈值的问题
59 | row_index = -1
60 | if max_sim > sup:
61 | # 相似度最大值对应文件中问题所在的行索引
62 | row_index = max_loc + 1
63 | return row_index
64 |
65 |
66 | def get_answer(answer_path, row_index):
67 | """
68 | :func: 得到问题对应的答案
69 | :param answer_path: 答案存储所在文件路径
70 | :param row_index: 答案的行索引
71 | :return:
72 | """
73 | answer = linecache.getline(answer_path, row_index)
74 | return answer
75 |
76 |
77 | if __name__ == '__main__':
78 | answer_path = './data/answer.txt'
79 | query_path = './data/question.txt'
80 |
81 | # 加载停用词
82 | stopword = []
83 | with open('./data/stopwords.txt', 'r', encoding='utf8') as f:
84 | for line in f.readlines():
85 | line = line.strip()
86 | stopword.append(line)
87 | print('退出请按q')
88 | while True:
89 | question = input('>:')
90 | if question == 'q':
91 | break
92 |
93 | # 首先分词然后去除停用词
94 | res = jieba.lcut(question)
95 | question_sep = []
96 | for r in res:
97 | if r not in stopword:
98 | question_sep.append(r)
99 | # question_sep 是问题经过分词, 停用词处理后的词表
100 | query = ' '.join(line for line in question_sep)
101 |
102 |         # match against the question bank first; -1 means no question passed the similarity threshold
103 |         row_index = similarity(query_path, query)
104 |         if row_index == -1:
105 |             print('<: 抱歉, 没有找到相似的问题')
106 |             continue
107 |         answer = get_answer(answer_path, row_index)
108 |         print('<:', answer)
109 |
--------------------------------------------------------------------------------
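
The moving parts of similarity() condensed into a toy example (hand-segmented documents, no stop-word handling), just to show the dictionary -> bow -> tfidf -> MatrixSimilarity chain:

from gensim import corpora, models, similarities

docs = [['中国', '首都', '北京'], ['美国', '首都', '华盛顿'], ['姚明', '老婆', '叶莉']]
dictionary = corpora.Dictionary(docs)               # word <-> id mapping
bows = [dictionary.doc2bow(d) for d in docs]        # sparse bag-of-words vectors
tfidf = models.TfidfModel(bows)                     # train tf-idf weights on the corpus
index = similarities.MatrixSimilarity(tfidf[bows])  # cosine-similarity index

query_bow = dictionary.doc2bow(['中国', '首都', '在', '哪儿'])
print(index[tfidf[query_bow]])  # one cosine score per document; argmax picks the best match
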
/gensim/002-gensim文本摘要.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/3 14:43
4 | @Auth : xiaolu
5 | @File :002-gensim文本摘要.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | import re
10 | from LAC import LAC
11 | from gensim.summarization.summarizer import summarize  # note: gensim.summarization was removed in gensim 4.0; this script needs gensim<4
12 |
13 |
14 | def clean(content):
15 | content = content.replace('.', '')
16 | content = content.replace(' ', '')
17 | content = content.replace('\n', '.')
18 | return content
19 |
20 |
21 | def process_data(text, lac):
22 | # 首先对text进行分句子 主要防止摘要为半句话
23 | text = re.split('[.。?!]', text)
24 |
25 | sentences = []
26 | for t in text:
27 | if len(t) == 0:
28 | continue
29 | t = lac.run(t)
30 | sentences.append(' '.join(t))
31 |
32 | # 最后用.将句子连起来
33 | return '. '.join(sentences)
34 |
35 |
36 | if __name__ == '__main__':
37 | lac = LAC(mode='seg')
38 |
39 | # 1. 加载文章
40 | data = []
41 | with open('./data/text.txt', 'r', encoding='utf8') as f:
42 | lines = f.readlines()
43 | for i, line in enumerate(lines):
44 | line = line.strip()
45 | line = process_data(line, lac)
46 | line = summarize(line)
47 | line = clean(line)
48 | print('*' * 20 + '第{}篇文章的摘要'.format(i + 1) + '*' * 20)
49 | print(line)
50 |
--------------------------------------------------------------------------------
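
For reference, summarize() exposes knobs for summary length. A small sketch, assuming gensim<4 and a hypothetical raw_article string prepared by process_data as above (word_count overrides ratio when both are given):

from gensim.summarization.summarizer import summarize

prepared = process_data(raw_article, lac)   # space-separated, '.'-joined sentences as above
print(summarize(prepared, ratio=0.2))       # keep roughly 20% of the sentences
print(summarize(prepared, word_count=80))   # or cap the summary at ~80 words
print(summarize(prepared, split=True))      # return a list of sentences instead of one string
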
/gensim/data/answer.txt:
--------------------------------------------------------------------------------
1 | 中国的首都是北京
2 | 美国的首都在华盛顿
3 | 陕西的省会城市是西安
4 | 山西的省会城市是太原
5 | 姚明的老婆是叶莉
6 | 姚明的女儿是姚沁蕾
7 | 国家主席是习近平
--------------------------------------------------------------------------------
/gensim/data/question.txt:
--------------------------------------------------------------------------------
1 | 中国的首都在哪儿
2 | 美国的首都在哪儿
3 | 陕西的省会城市在哪
4 | 山西的省会城市在哪儿
5 | 姚明的老婆是谁
6 | 姚明的女儿是谁
7 | 国家主席是谁呀
--------------------------------------------------------------------------------
/gensim/data/stopwords.txt:
--------------------------------------------------------------------------------
1 | ———
2 | 》),
3 | )÷(1-
4 | ”,
5 | )、
6 | =(
7 | :
8 | →
9 | ℃
10 | &
11 | *
12 | 一一
13 | ~~~~
14 | ’
15 | .
16 | 『
17 | .一
18 | ./
19 | --
20 | 』
21 | =″
22 | 【
23 | [*]
24 | }>
25 | [⑤]]
26 | [①D]
27 | c]
28 | ng昉
29 | *
30 | //
31 | [
32 | ]
33 | [②e]
34 | [②g]
35 | ={
36 | }
37 | ,也
38 | ‘
39 | A
40 | [①⑥]
41 | [②B]
42 | [①a]
43 | [④a]
44 | [①③]
45 | [③h]
46 | ③]
47 | 1.
48 | --
49 | [②b]
50 | ’‘
51 | ×××
52 | [①⑧]
53 | 0:2
54 | =[
55 | [⑤b]
56 | [②c]
57 | [④b]
58 | [②③]
59 | [③a]
60 | [④c]
61 | [①⑤]
62 | [①⑦]
63 | [①g]
64 | ∈[
65 | [①⑨]
66 | [①④]
67 | [①c]
68 | [②f]
69 | [②⑧]
70 | [②①]
71 | [①C]
72 | [③c]
73 | [③g]
74 | [②⑤]
75 | [②②]
76 | 一.
77 | [①h]
78 | .数
79 | []
80 | [①B]
81 | 数/
82 | [①i]
83 | [③e]
84 | [①①]
85 | [④d]
86 | [④e]
87 | [③b]
88 | [⑤a]
89 | [①A]
90 | [②⑧]
91 | [②⑦]
92 | [①d]
93 | [②j]
94 | 〕〔
95 | ][
96 | ://
97 | ′∈
98 | [②④
99 | [⑤e]
100 | 12%
101 | b]
102 | ...
103 | ...................
104 | …………………………………………………③
105 | ZXFITL
106 | [③F]
107 | 」
108 | [①o]
109 | ]∧′=[
110 | ∪φ∈
111 | ′|
112 | {-
113 | ②c
114 | }
115 | [③①]
116 | R.L.
117 | [①E]
118 | Ψ
119 | -[*]-
120 | ↑
121 | .日
122 | [②d]
123 | [②
124 | [②⑦]
125 | [②②]
126 | [③e]
127 | [①i]
128 | [①B]
129 | [①h]
130 | [①d]
131 | [①g]
132 | [①②]
133 | [②a]
134 | f]
135 | [⑩]
136 | a]
137 | [①e]
138 | [②h]
139 | [②⑥]
140 | [③d]
141 | [②⑩]
142 | e]
143 | 〉
144 | 】
145 | 元/吨
146 | [②⑩]
147 | 2.3%
148 | 5:0
149 | [①]
150 | ::
151 | [②]
152 | [③]
153 | [④]
154 | [⑤]
155 | [⑥]
156 | [⑦]
157 | [⑧]
158 | [⑨]
159 | ……
160 | ——
161 | ?
162 | 、
163 | 。
164 | “
165 | ”
166 | 《
167 | 》
168 | !
169 | ,
170 | :
171 | ;
172 | ?
173 | .
174 | ,
175 | .
176 | '
177 | ?
178 | ·
179 | ———
180 | ──
181 | ?
182 | —
183 | <
184 | >
185 | (
186 | )
187 | 〔
188 | 〕
189 | [
190 | ]
191 | (
192 | )
193 | -
194 | +
195 | ~
196 | ×
197 | /
198 | /
199 | ①
200 | ②
201 | ③
202 | ④
203 | ⑤
204 | ⑥
205 | ⑦
206 | ⑧
207 | ⑨
208 | ⑩
209 | Ⅲ
210 | В
211 | "
212 | ;
213 | #
214 | @
215 | γ
216 | μ
217 | φ
218 | φ.
219 | ×
220 | Δ
221 | ■
222 | ▲
223 | sub
224 | exp
225 | sup
226 | sub
227 | Lex
228 | #
229 | %
230 | &
231 | '
232 | +
233 | +ξ
234 | ++
235 | -
236 | -β
237 | <
238 | <±
239 | <Δ
240 | <λ
241 | <φ
242 | <<
243 | =
244 | =
245 | =☆
246 | =-
247 | >
248 | >λ
249 | _
250 | ~±
251 | ~+
252 | [⑤f]
253 | [⑤d]
254 | [②i]
255 | ≈
256 | [②G]
257 | [①f]
258 | LI
259 | ㈧
260 | [-
261 | ......
262 | 〉
263 | [③⑩]
264 | 第二
265 | 一番
266 | 一直
267 | 一个
268 | 一些
269 | 许多
270 | 种
271 | 有的是
272 | 也就是说
273 | 末##末
274 | 啊
275 | 阿
276 | 哎
277 | 哎呀
278 | 哎哟
279 | 唉
280 | 俺
281 | 俺们
282 | 按
283 | 按照
284 | 吧
285 | 吧哒
286 | 把
287 | 罢了
288 | 被
289 | 本
290 | 本着
291 | 比
292 | 比方
293 | 比如
294 | 鄙人
295 | 彼
296 | 彼此
297 | 边
298 | 别
299 | 别的
300 | 别说
301 | 并
302 | 并且
303 | 不比
304 | 不成
305 | 不单
306 | 不但
307 | 不独
308 | 不管
309 | 不光
310 | 不过
311 | 不仅
312 | 不拘
313 | 不论
314 | 不怕
315 | 不然
316 | 不如
317 | 不特
318 | 不惟
319 | 不问
320 | 不只
321 | 朝
322 | 朝着
323 | 趁
324 | 趁着
325 | 乘
326 | 冲
327 | 除
328 | 除此之外
329 | 除非
330 | 除了
331 | 此
332 | 此间
333 | 此外
334 | 从
335 | 从而
336 | 打
337 | 待
338 | 但
339 | 但是
340 | 当
341 | 当着
342 | 到
343 | 得
344 | 的
345 | 的话
346 | 等
347 | 等等
348 | 地
349 | 第
350 | 叮咚
351 | 对
352 | 对于
353 | 多
354 | 多少
355 | 而
356 | 而况
357 | 而且
358 | 而是
359 | 而外
360 | 而言
361 | 而已
362 | 尔后
363 | 反过来
364 | 反过来说
365 | 反之
366 | 非但
367 | 非徒
368 | 否则
369 | 嘎
370 | 嘎登
371 | 该
372 | 赶
373 | 个
374 | 各
375 | 各个
376 | 各位
377 | 各种
378 | 各自
379 | 给
380 | 根据
381 | 跟
382 | 故
383 | 故此
384 | 固然
385 | 关于
386 | 管
387 | 归
388 | 果然
389 | 果真
390 | 过
391 | 哈
392 | 哈哈
393 | 呵
394 | 和
395 | 何
396 | 何处
397 | 何况
398 | 何时
399 | 嘿
400 | 哼
401 | 哼唷
402 | 呼哧
403 | 乎
404 | 哗
405 | 还是
406 | 还有
407 | 换句话说
408 | 换言之
409 | 或
410 | 或是
411 | 或者
412 | 极了
413 | 及
414 | 及其
415 | 及至
416 | 即
417 | 即便
418 | 即或
419 | 即令
420 | 即若
421 | 即使
422 | 几
423 | 几时
424 | 己
425 | 既
426 | 既然
427 | 既是
428 | 继而
429 | 加之
430 | 假如
431 | 假若
432 | 假使
433 | 鉴于
434 | 将
435 | 较
436 | 较之
437 | 叫
438 | 接着
439 | 结果
440 | 借
441 | 紧接着
442 | 进而
443 | 尽
444 | 尽管
445 | 经
446 | 经过
447 | 就
448 | 就是
449 | 就是说
450 | 据
451 | 具体地说
452 | 具体说来
453 | 开始
454 | 开外
455 | 靠
456 | 咳
457 | 可
458 | 可见
459 | 可是
460 | 可以
461 | 况且
462 | 啦
463 | 来
464 | 来着
465 | 离
466 | 例如
467 | 哩
468 | 连
469 | 连同
470 | 两者
471 | 了
472 | 临
473 | 另
474 | 另外
475 | 另一方面
476 | 论
477 | 嘛
478 | 吗
479 | 慢说
480 | 漫说
481 | 冒
482 | 么
483 | 每
484 | 每当
485 | 们
486 | 莫若
487 | 某
488 | 某个
489 | 某些
490 | 拿
491 | 哪
492 | 哪边
493 | 哪儿
494 | 哪个
495 | 哪里
496 | 哪年
497 | 哪怕
498 | 哪天
499 | 哪些
500 | 哪样
501 | 那
502 | 那边
503 | 那儿
504 | 那个
505 | 那会儿
506 | 那里
507 | 那么
508 | 那么些
509 | 那么样
510 | 那时
511 | 那些
512 | 那样
513 | 乃
514 | 乃至
515 | 呢
516 | 能
517 | 你
518 | 你们
519 | 您
520 | 宁
521 | 宁可
522 | 宁肯
523 | 宁愿
524 | 哦
525 | 呕
526 | 啪达
527 | 旁人
528 | 呸
529 | 凭
530 | 凭借
531 | 其
532 | 其次
533 | 其二
534 | 其他
535 | 其它
536 | 其一
537 | 其余
538 | 其中
539 | 起
540 | 起见
541 | 起见
542 | 岂但
543 | 恰恰相反
544 | 前后
545 | 前者
546 | 且
547 | 然而
548 | 然后
549 | 然则
550 | 让
551 | 人家
552 | 任
553 | 任何
554 | 任凭
555 | 如
556 | 如此
557 | 如果
558 | 如何
559 | 如其
560 | 如若
561 | 如上所述
562 | 若
563 | 若非
564 | 若是
565 | 啥
566 | 上下
567 | 尚且
568 | 设若
569 | 设使
570 | 甚而
571 | 甚么
572 | 甚至
573 | 省得
574 | 时候
575 | 什么
576 | 什么样
577 | 使得
578 | 是
579 | 是的
580 | 首先
581 | 谁
582 | 谁知
583 | 顺
584 | 顺着
585 | 似的
586 | 虽
587 | 虽然
588 | 虽说
589 | 虽则
590 | 随
591 | 随着
592 | 所
593 | 所以
594 | 他
595 | 他们
596 | 他人
597 | 它
598 | 它们
599 | 她
600 | 她们
601 | 倘
602 | 倘或
603 | 倘然
604 | 倘若
605 | 倘使
606 | 腾
607 | 替
608 | 通过
609 | 同
610 | 同时
611 | 哇
612 | 万一
613 | 往
614 | 望
615 | 为
616 | 为何
617 | 为了
618 | 为什么
619 | 为着
620 | 喂
621 | 嗡嗡
622 | 我
623 | 我们
624 | 呜
625 | 呜呼
626 | 乌乎
627 | 无论
628 | 无宁
629 | 毋宁
630 | 嘻
631 | 吓
632 | 相对而言
633 | 像
634 | 向
635 | 向着
636 | 嘘
637 | 呀
638 | 焉
639 | 沿
640 | 沿着
641 | 要
642 | 要不
643 | 要不然
644 | 要不是
645 | 要么
646 | 要是
647 | 也
648 | 也罢
649 | 也好
650 | 一
651 | 一般
652 | 一旦
653 | 一方面
654 | 一来
655 | 一切
656 | 一样
657 | 一则
658 | 依
659 | 依照
660 | 矣
661 | 以
662 | 以便
663 | 以及
664 | 以免
665 | 以至
666 | 以至于
667 | 以致
668 | 抑或
669 | 因
670 | 因此
671 | 因而
672 | 因为
673 | 哟
674 | 用
675 | 由
676 | 由此可见
677 | 由于
678 | 有
679 | 有的
680 | 有关
681 | 有些
682 | 又
683 | 于
684 | 于是
685 | 于是乎
686 | 与
687 | 与此同时
688 | 与否
689 | 与其
690 | 越是
691 | 云云
692 | 哉
693 | 再说
694 | 再者
695 | 在
696 | 在下
697 | 咱
698 | 咱们
699 | 则
700 | 怎
701 | 怎么
702 | 怎么办
703 | 怎么样
704 | 怎样
705 | 咋
706 | 照
707 | 照着
708 | 者
709 | 这
710 | 这边
711 | 这儿
712 | 这个
713 | 这会儿
714 | 这就是说
715 | 这里
716 | 这么
717 | 这么点儿
718 | 这么些
719 | 这么样
720 | 这时
721 | 这些
722 | 这样
723 | 正如
724 | 吱
725 | 之
726 | 之类
727 | 之所以
728 | 之一
729 | 只是
730 | 只限
731 | 只要
732 | 只有
733 | 至
734 | 至于
735 | 诸位
736 | 着
737 | 着呢
738 | 自
739 | 自从
740 | 自个儿
741 | 自各儿
742 | 自己
743 | 自家
744 | 自身
745 | 综上所述
746 | 总的来看
747 | 总的来说
748 | 总的说来
749 | 总而言之
750 | 总之
751 | 纵
752 | 纵令
753 | 纵然
754 | 纵使
755 | 遵照
756 | 作为
757 | 兮
758 | 呃
759 | 呗
760 | 咚
761 | 咦
762 | 喏
763 | 啐
764 | 喔唷
765 | 嗬
766 | 嗯
767 | 嗳
768 |
--------------------------------------------------------------------------------
/gensim/data/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/8/6 10:18
3 | # @Author : xiaolu
4 | # @FileName: test.py
5 | # @Software: PyCharm
6 | import linecache
7 |
8 |
9 | path = 'answer.txt'
10 | # linecache line numbers are 1-based: getline(path, 0) just returns ''
11 | for i in range(1, 6):
12 |     answer = linecache.getline(path, i)
13 |     answer = answer.strip()
14 |     print(answer)
--------------------------------------------------------------------------------
/gradio学习/01-row_column_layout.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 01-row_column_layout.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2023-05-12
6 | """
7 | import gradio as gr
8 |
9 | title = "抽取式问答"
10 |
11 | description = "输入上下文与问题后,点击submit按钮,可从上下文中抽取出答案,赶快试试吧!"
12 |
13 | examples = [
14 | ["普希金从那里学习人民的语言,吸取了许多有益的养料,这一切对普希金后来的创作产生了很大的影响。这两年里,普希金创作了不少优秀的作品,如《囚徒》、《致大海》、《致凯恩》和《假如生活欺骗了你》等几十首抒情诗,叙事诗《努林伯爵》,历史剧《鲍里斯·戈都诺夫》,以及《叶甫盖尼·奥涅金》前六章。", "著名诗歌《假如生活欺骗了你》的作者是"],
15 | ["普希金从那里学习人民的语言,吸取了许多有益的养料,这一切对普希金后来的创作产生了很大的影响。这两年里,普希金创作了不少优秀的作品,如《囚徒》、《致大海》、《致凯恩》和《假如生活欺骗了你》等几十首抒情诗,叙事诗《努林伯爵》,历史剧《鲍里斯·戈都诺夫》,以及《叶甫盖尼·奥涅金》前六章。", "普希金创作的叙事诗叫什么"]
16 | ]
17 |
18 | article = "感兴趣的小伙伴可以阅读[Transformers实用指南](https://zhuanlan.zhihu.com/p/548336726)"
19 |
20 |
21 | # 预测函数
22 | def custom_predict(context, question):
23 | answer = '对不起 我就是不给你回答'
24 | answer = question + ": " + answer
25 | score = 0.01
26 | return answer, score
27 |
28 |
29 | # 清除输入输出
30 | def clear_input():
31 | return "", "", "", ""
32 |
33 |
34 | # 构建Blocks上下文
35 | with gr.Blocks() as demo:
36 |     gr.Markdown(f"# {title}")
37 |     gr.Markdown(description)
38 | with gr.Column(): # 列排列
39 | context = gr.Textbox(label="context")
40 | question = gr.Textbox(label="question")
41 | with gr.Row(): # 行排列
42 | clear = gr.Button("clear") # 清除按钮
43 | submit = gr.Button("submit") # submit提交按钮
44 | with gr.Column(): # 列排列
45 | answer = gr.Textbox(label="answer")
46 | score = gr.Label(label="score")
47 |
48 | # 绑定submit点击函数
49 | submit.click(fn=custom_predict, inputs=[context, question], outputs=[answer, score])
50 |
51 | # 绑定clear点击函数
52 | clear.click(fn=clear_input, inputs=[], outputs=[context, question, answer, score])
53 | gr.Examples(examples, inputs=[context, question])
54 |     gr.Markdown(article)
55 |
56 | demo.launch()
57 |
--------------------------------------------------------------------------------
/gradio学习/02-chatglm_web.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 02-chatglm_web.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2023-05-12
6 | """
7 | from transformers import AutoModel, AutoTokenizer
8 | import gradio as gr
9 | import mdtex2html  # used by postprocess below; missing from the original imports
10 | # 加载模型
11 | tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
12 | model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
13 | model = model.eval()
14 |
15 |
16 | """Override Chatbot.postprocess"""
17 | def postprocess(self, y):
18 | if y is None:
19 | return []
20 | for i, (message, response) in enumerate(y):
21 | y[i] = (
22 | None if message is None else mdtex2html.convert((message)),
23 | None if response is None else mdtex2html.convert(response),
24 | )
25 | return y
26 |
27 |
28 | gr.Chatbot.postprocess = postprocess
29 |
30 |
31 | def parse_text(text):
32 | """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
33 | lines = text.split("\n")
34 | lines = [line for line in lines if line != ""]
35 | count = 0
36 | for i, line in enumerate(lines):
37 | if "```" in line:
38 | count += 1
39 | items = line.split('`')
40 | if count % 2 == 1:
41 |                 lines[i] = f'<pre><code class="language-{items[-1]}">'
42 |             else:
43 |                 lines[i] = f'<br></code></pre>'
44 |         else:
45 |             if i > 0:
46 |                 if count % 2 == 1:
47 |                     line = line.replace("`", "\`")
48 |                     line = line.replace("<", "&lt;")
49 |                     line = line.replace(">", "&gt;")
50 |                     line = line.replace(" ", "&nbsp;")
51 |                     line = line.replace("*", "&ast;")
52 |                     line = line.replace("_", "&lowbar;")
53 |                     line = line.replace("-", "&#45;")
54 |                     line = line.replace(".", "&#46;")
55 |                     line = line.replace("!", "&#33;")
56 |                     line = line.replace("(", "&#40;")
57 |                     line = line.replace(")", "&#41;")
58 |                     line = line.replace("$", "&#36;")
59 |                 lines[i] = "<br>" + line
60 | text = "".join(lines)
61 | return text
62 |
63 |
64 | def predict(input, chatbot, max_length, top_p, temperature, history):
65 | chatbot.append((parse_text(input), ""))
66 | for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
67 | temperature=temperature):
68 | chatbot[-1] = (parse_text(input), parse_text(response))
69 | yield chatbot, history
70 |
71 |
72 | def reset_user_input():
73 | return gr.update(value='')
74 |
75 |
76 | def reset_state():
77 | return [], []
78 |
79 |
80 | with gr.Blocks() as demo:
81 |     gr.HTML("""<h1 align="center">ChatGLM</h1>""")  # arbitrary front-end markup can be embedded here
82 |
83 | chatbot = gr.Chatbot() # 占一行 chatbot
84 | with gr.Row(): # 下面的每个元素行行排列
85 | with gr.Column(scale=4): # 行 左 占总行空间的4/5
86 | with gr.Column(scale=12):
87 | user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10).style(
88 | container=False)
89 | with gr.Column(min_width=32, scale=1):
90 | submitBtn = gr.Button("Submit", variant="primary")
91 |
92 | with gr.Column(scale=1): # 行 右 占总行空间的1/5
93 | emptyBtn = gr.Button("Clear History")
94 | max_length = gr.Slider(0, 4096, value=2048, step=1.0, label="Maximum length", interactive=True)
95 | top_p = gr.Slider(0, 1, value=0.7, step=0.01, label="Top P", interactive=True)
96 | temperature = gr.Slider(0, 1, value=0.95, step=0.01, label="Temperature", interactive=True)
97 |
98 | history = gr.State([])
99 | submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, history], [chatbot, history],
100 | show_progress=True)
101 | submitBtn.click(reset_user_input, [], [user_input]) # 点了提交按钮后 用户输入框也得改下
102 | emptyBtn.click(reset_state, outputs=[chatbot, history], show_progress=True)
103 |
104 | demo.queue().launch(share=False, inbrowser=True)
105 | # 如果想加登录
106 | # zhanghu = [["xiaolu", "1234"]]
107 | # demo.queue().launch(share=True, server_name='0.0.0.0', server_port=6006, auth=zhanghu, auth_message='请联系xiaolu认证进行访问')
108 |
--------------------------------------------------------------------------------
/ipdb调试python程序/001-简单调试.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/8/6 10:04
3 | # @Author : xiaolu
4 | # @FileName: 001-简单调试.py
5 | # @Software: PyCharm
6 | from ipdb import set_trace
7 |
8 |
9 | if __name__ == "__main__":
10 | a = 0
11 | b = 1
12 | for i in range(1, 100, 2):
13 | a += i
14 | b *= i
15 | set_trace()
16 |
17 |
18 | # ipdb> print(a)
19 | # 1
20 | # ipdb> print(b)
21 | # 1
22 | # 接下来输入n 每输入一次 往后执行一行
23 |
24 | # 假设输入两次n 此时的a=4 b=1 输三次n 此时的a=4 b=3
25 |
--------------------------------------------------------------------------------
/ipdb调试python程序/readme.txt:
--------------------------------------------------------------------------------
1 | ipdb command cheat sheet:
2 |
3 | ENTER (repeat the last command)
4 | c (continue execution)
5 | l (show where execution currently is)
6 | s (step into a subroutine)
7 | r (run until the current subroutine returns)
8 | ! (execute a python statement directly, e.g. !a = 10)
9 | h (help)
10 | a(rgs)   print the arguments of the current function
11 | j(ump)   jump execution to a given line number
12 | l(ist)   list the block of code about to run
13 | n(ext)   run the next line; if it contains a function call, n does NOT step into it
14 | p(rint)  one of the most useful commands: print a variable
15 | q(uit)   quit the debugger
16 | r(eturn) continue until the current function returns
17 | s(tep)   like n, but steps into any function called on the current line
--------------------------------------------------------------------------------
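
Two related entry points worth knowing (standard ipdb/pdb behaviour): running a whole script under the debugger, and post-mortem inspection after an exception:

# python -m ipdb 001-简单调试.py                   run the whole script under ipdb
# PYTHONBREAKPOINT=ipdb.set_trace python xxx.py   route built-in breakpoint() to ipdb (Python 3.7+)
import ipdb

try:
    1 / 0
except ZeroDivisionError:
    ipdb.post_mortem()  # open the debugger at the frame where the exception was raised
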
/logging模块的使用/001-日志级别的使用.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-日志级别的使用.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-21
6 | """
7 | import logging
8 |
9 | if __name__ == '__main__':
10 |     logging.basicConfig(level=logging.NOTSET)  # NOTSET means no filtering: all five messages below are emitted (without basicConfig, the root default is WARNING)
11 | logging.debug('数学')
12 | logging.info('英语')
13 | logging.warning('物理')
14 | logging.error('体育')
15 | logging.critical('政治')
--------------------------------------------------------------------------------
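
For contrast, a sketch of the same five calls with the threshold raised to WARNING; only the last three survive the filter:

import logging

logging.basicConfig(level=logging.WARNING)
logging.debug('数学')     # filtered out
logging.info('英语')      # filtered out
logging.warning('物理')   # emitted
logging.error('体育')     # emitted
logging.critical('政治')  # emitted
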
/logging模块的使用/002-日志控制台输出.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-日志控制台输出.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-21
6 | """
7 | import logging # 引入logging模块
8 | if __name__ == '__main__':
9 |     logging.basicConfig(level=logging.DEBUG,  # lowest level that will be emitted
10 |                         format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
11 |     # asctime = timestamp, filename = source file, lineno = line number, levelname = severity, message = the log text
12 |     # with the level set to DEBUG, all five messages below appear on the console
13 |     logging.info('this is a logging info message')
14 |     logging.debug('this is a logging debug message')
15 |     logging.warning('this is a logging warning message')
16 |     logging.error('this is a logging error message')
17 |     logging.critical('this is a logging critical message')
18 |
--------------------------------------------------------------------------------
/logging模块的使用/003-日志文件输出.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-日志文件输出.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-21
6 | """
7 | import logging # 引入logging模块
8 | import os.path
9 | import time
10 |
11 | if __name__ == '__main__':
12 | # 第一步,创建一个logger
13 | logger = logging.getLogger()
14 | logger.setLevel(logging.INFO) # Log等级总开关
15 |
16 | # 第二步,创建一个handler,用于写入日志文件
17 | rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
18 | log_path = os.path.dirname(os.getcwd()) + '/Logs/'
19 | os.makedirs(log_path, exist_ok=True) # 创建文件夹
20 | log_name = log_path + rq + '.log' # 日志名
21 | logfile = log_name
22 | fh = logging.FileHandler(logfile, mode='w')
23 |     fh.setLevel(logging.DEBUG)  # per-handler threshold; the logger itself is INFO, so DEBUG records never reach this handler anyway
24 |
25 | # 第三步,定义handler的输出格式
26 | formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
27 | fh.setFormatter(formatter)
28 |
29 | # 第四步,将logger添加到handler里面
30 | logger.addHandler(fh)
31 |
32 | # 日志
33 | logger.debug('this is a logger debug message')
34 | logger.info('this is a logger info message')
35 | logger.warning('this is a logger warning message')
36 | logger.error('this is a logger error message')
37 | logger.critical('this is a logger critical message')
--------------------------------------------------------------------------------
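
The FileHandler recipe above extends naturally to multiple destinations. A minimal sketch (hypothetical 'run.log' path) that attaches a console handler alongside the file handler:

import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")

fh = logging.FileHandler('run.log', mode='w')  # hypothetical log path
fh.setFormatter(formatter)
logger.addHandler(fh)

ch = logging.StreamHandler()  # console output
ch.setFormatter(formatter)
logger.addHandler(ch)

logger.info('this record goes to both run.log and the console')
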
/logging模块的使用/004-捕捉异常.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 004-捕捉异常.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-21
6 | """
7 | import os.path
8 | import time
9 | import logging
10 |
11 | if __name__ == '__main__':
12 | # 创建一个logger
13 | logger = logging.getLogger()
14 | logger.setLevel(logging.INFO) # Log等级总开关
15 |
16 | # 创建一个handler,用于写入日志文件
17 | rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
18 | log_path = os.path.dirname(os.getcwd()) + '/Logs/'
19 | os.makedirs(log_path, exist_ok=True)
20 | log_name = log_path + rq + '.log'
21 | logfile = log_name
22 | fh = logging.FileHandler(logfile, mode='w')
23 |     fh.setLevel(logging.DEBUG)  # per-handler threshold; the logger itself is INFO, so DEBUG records never reach this handler anyway
24 |
25 | # 定义handler的输出格式
26 | formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
27 | fh.setFormatter(formatter)
28 | logger.addHandler(fh)
29 |
30 | # 使用logger.XX来记录错误,这里的"error"可以根据所需要的级别进行修改
31 | try:
32 | open('/path/to/does/not/exist', 'rb')
33 | except (SystemExit, KeyboardInterrupt):
34 | raise
35 | except Exception:
36 | logger.error('Failed to open file', exc_info=True)
37 |
--------------------------------------------------------------------------------
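
Inside an except block, logger.exception(...) is shorthand for logger.error(..., exc_info=True); a sketch reusing the logger configured above:

try:
    open('/path/to/does/not/exist', 'rb')
except Exception:
    logger.exception('Failed to open file')  # logs at ERROR level and appends the traceback
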
/pandas一键画图/001-plot_zhexiantu.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-plot_zhexiantu.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-12-29
6 | """
7 | # 安装pandas以及pandas_bokeh # pip install pandas_bokeh pandas
8 | import numpy as np
9 | import pandas as pd
10 | import pandas_bokeh
11 |
12 | # 注意 文件名字不要夹带中文
13 |
14 | if __name__ == '__main__':
15 | np.random.seed(55)
16 | df = pd.DataFrame({"宁德时代": np.random.randn(100)+0.2,
17 | "贵州茅台": np.random.randn(100)+0.17},
18 | index=pd.date_range('1/1/2021', periods=100))
19 | df = df.cumsum() # 累加
20 | df = df + 50
21 | df.plot_bokeh.line(
22 | figsize=(800, 450), # 图片的大小
23 | title="宁德时代 vs 贵州茅台", # 表名
24 | xlabel="日期", # 横坐标的名字
25 | ylabel="股票价格 [$]", # 纵坐标的名字
26 | # yticks=[0, 100, 200, 300, 400], # y轴的虚线 可以不带
27 | ylim=(45, 80), # y轴范围
28 | xlim=("2021-01-01", "2021-04-01"), # x轴的范围
29 | colormap=["red", "blue"],
30 | plot_data_points=True, # 标记每个值
31 | plot_data_points_size=5,
32 | marker="asterisk")
33 |
--------------------------------------------------------------------------------
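
pandas-bokeh can also be registered as the global pandas plotting backend, so plain df.plot(...) calls produce the same interactive output. A sketch, assuming pandas>=0.25 and the df built in the script above (output file name is hypothetical):

import pandas as pd
import pandas_bokeh

pd.set_option('plotting.backend', 'pandas_bokeh')    # route df.plot(...) through pandas-bokeh
pandas_bokeh.output_file('zhexiantu_backend.html')   # write the plot to an html file

df.plot(kind='line', title='宁德时代 vs 贵州茅台')
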
/pandas一键画图/002-plot_sandiantu.html:
--------------------------------------------------------------------------------
(generated Bokeh plot page: the inline bokeh scripts and plot JSON were stripped in this dump; only the "Bokeh Plot" page title survives)
--------------------------------------------------------------------------------
/pandas一键画图/002-plot_sandiantu.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-plot_sandiantu.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-12-29
6 | """
7 | import pandas as pd
8 | import pandas_bokeh
9 |
10 | if __name__ == '__main__':
11 | # 随便造一些数据
12 | df = pd.DataFrame({
13 | 'length': [5.1, 4.9, 4.7, 4.6, 5., 5.4, 4.6, 5., 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7],
14 | 'width': [3.5, 3., 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3., 3., 4., 4.4],
15 | 'label': [0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1]
16 | })
17 |
18 | p_scatter = df.plot_bokeh.scatter(
19 | x="length",
20 | y="width",
21 |         category="label",  # optional: color the points by the 'label' column
22 | title="随便一画",
23 | show_figure=True,
24 | )
--------------------------------------------------------------------------------
/pandas一键画图/003-plot_zhuzhuangtu.html:
--------------------------------------------------------------------------------
(generated Bokeh plot page: the inline bokeh scripts and plot JSON were stripped in this dump; only the "Bokeh Plot" page title survives)
--------------------------------------------------------------------------------
/pandas一键画图/003-plot_zhuzhuangtu.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-plot_zhuzhuangtu.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-12-29
6 | """
7 | import pandas as pd
8 | import pandas_bokeh
9 |
10 |
11 | if __name__ == '__main__':
12 | data = {
13 | 'fruits':
14 | ['苹果', '梨', '草莓', '西瓜', '葡萄', '香蕉'],
15 | '2015': [2, 1, 4, 3, 2, 4],
16 | '2016': [5, 3, 3, 2, 4, 6],
17 | '2017': [3, 2, 4, 4, 5, 3]
18 | }
19 | df = pd.DataFrame(data).set_index("fruits") # 设置水果为索引
20 |
21 | p_bar = df.plot_bokeh.bar(
22 |         ylabel="每斤的价格 [¥]",
23 | title="水果每年的价格",
24 | alpha=0.6)
--------------------------------------------------------------------------------
/py2neo操作neo4j/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/.DS_Store
--------------------------------------------------------------------------------
/py2neo操作neo4j/py2neo简单练习/create_graph_v1.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : create_graph_v1.py
4 | # @Time : 2020/11/23 6:52 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | from py2neo import Graph, Node, Relationship, NodeMatcher
10 | import pandas as pd
11 | from pdb import set_trace
12 |
13 |
14 | def load_data():
15 | # 加载数据
16 | # data = pd.read_excel('./santi.xlsx')
17 | # data = pd.read_excel('./mingchaonaxieshier.xlsx')
18 | data = pd.read_excel('./test.xlsx')
19 | start = data['S'].tolist()
20 | relation = data['P'].tolist()
21 | end = data['O'].tolist()
22 | start_list = [str(i) for i in start]
23 | relation_list = [str(i) for i in relation]
24 | end_list = [str(i) for i in end]
25 | link_dict = dict()
26 | link_dict['start'] = start_list
27 | link_dict['relation'] = relation_list
28 | link_dict['end'] = end_list
29 | df_data = pd.DataFrame(link_dict)
30 | return df_data
31 |
32 |
33 | class DataToNeo4j:
34 | def __init__(self):
35 | link = Graph()
36 | self.graph = link
37 |
38 | self.start = 'start'
39 | self.end = 'end'
40 |
41 | self.graph.delete_all() # 将之前的图 全部删除
42 | self.matcher = NodeMatcher(link) # 为了查找
43 |
44 | def create_node(self, start, end):
45 | # 创建节点
46 | for name in start:
47 | node = Node(self.start, name=name)
48 | self.graph.create(node)
49 |
50 | for name in end:
51 | node = Node(self.end, name=name)
52 | self.graph.create(node)
53 |
54 | def create_relation(self, df_data):
55 | m = 0
56 | for m in range(0, len(df_data)):
57 | # print(list(self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'")))
58 | # 相当于在'start'标签下找 name=某个名字的节点
59 | # print(list(self.matcher.match(self.end).where('_.name=' + "'" + df_data['end'][m] + "'")))
60 | # 相当于在'end'标签下找 name=某个名字的节点'
61 | # 然后为这两个节点创建关系
62 | try:
63 | rel = Relationship(
64 | self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'").first(),
65 | df_data['relation'][m],
66 | self.matcher.match(self.end).where('_.name=' + "'" + df_data['end'][m] + "'").first()
67 | )
68 | self.graph.create(rel)
69 | except AttributeError as e:
70 | print(e, m)
71 |
72 |
73 | def data_extraction(df_data):
74 | node_start = []
75 | for i in df_data['start'].tolist():
76 | node_start.append(i)
77 |
78 | node_end = []
79 | for i in df_data['end'].tolist():
80 | node_end.append(i)
81 |
82 | # 去重
83 | node_start = list(set(node_start))
84 | node_end = list(set(node_end))
85 | return node_start, node_end
86 |
87 |
88 | if __name__ == '__main__':
89 | df_data = load_data()
90 | # print(df_data.head())
91 | node_start, node_end = data_extraction(df_data)
92 | # 创建图
93 | create_data = DataToNeo4j()
94 | # 节点
95 | create_data.create_node(node_start, node_end)
96 | # 关系
97 | create_data.create_relation(df_data)
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
--------------------------------------------------------------------------------
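
NodeMatcher can filter on properties directly, which avoids hand-building the where-string (and its quoting pitfalls) as create_relation does above. A sketch with a hypothetical node name:

from py2neo import Graph, NodeMatcher

graph = Graph()
matcher = NodeMatcher(graph)

# equivalent to .where("_.name='叶文洁'") but safe against quotes in the name
node = matcher.match('start', name='叶文洁').first()  # '叶文洁' is a hypothetical value
print(node)
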
/py2neo操作neo4j/py2neo简单练习/create_graph_v2.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : create_graph_v2.py
4 | # @Time : 2020/11/23 9:54 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
17 | from py2neo import Graph, Node, Relationship, NodeMatcher
18 | import pandas as pd
19 | from pdb import set_trace
20 |
21 |
22 | def load_data():
23 | # 加载数据
24 | data = pd.read_excel('./santi.xlsx')
25 | # data = pd.read_excel('./mingchaonaxieshier.xlsx')
26 | # data = pd.read_excel('./test.xlsx')
27 | start = data['S'].tolist()
28 | relation = data['P'].tolist()
29 | end = data['O'].tolist()
30 | start_list = [str(i) for i in start]
31 | relation_list = [str(i) for i in relation]
32 | end_list = [str(i) for i in end]
33 | link_dict = dict()
34 | link_dict['start'] = start_list
35 | link_dict['relation'] = relation_list
36 | link_dict['end'] = end_list
37 | df_data = pd.DataFrame(link_dict)
38 | return df_data
39 |
40 |
41 | class DataToNeo4j:
42 | def __init__(self):
43 | link = Graph()
44 | self.graph = link
45 |
46 | self.start = 'start'
47 | self.end = 'end'
48 |
49 | self.graph.delete_all() # 将之前的图 全部删除
50 | self.matcher = NodeMatcher(link) # 为了查找
51 |
52 | def create_node(self, start, end):
53 | # 创建节点
54 | temp = []
55 | temp.extend(start)
56 | temp.extend(end)
57 | temp = list(set(temp))
58 | for t in temp:
59 | node = Node(self.start, name=t)
60 | self.graph.create(node)
61 |
62 |
63 | # for name in start:
64 | # node = Node(self.start, name=name)
65 | # self.graph.create(node)
66 | #
67 | # for name in end:
68 | # node = Node(self.end, name=name)
69 | # self.graph.create(node)
70 |
71 | def create_relation(self, df_data):
72 | m = 0
73 | for m in range(0, len(df_data)):
74 | # print(list(self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'")))
75 | # 相当于在'start'标签下找 name=某个名字的节点
76 | # print(list(self.matcher.match(self.end).where('_.name=' + "'" + df_data['end'][m] + "'")))
77 | # 相当于在'end'标签下找 name=某个名字的节点'
78 | # 然后为这两个节点创建关系
79 | try:
80 | rel = Relationship(
81 | self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'").first(),
82 | df_data['relation'][m],
83 | self.matcher.match(self.start).where('_.name=' + "'" + df_data['end'][m] + "'").first()
84 | )
85 | self.graph.create(rel)
86 | except AttributeError as e:
87 | print(e, m)
88 |
89 |
90 | def data_extraction(df_data):
91 | node_start = []
92 | for i in df_data['start'].tolist():
93 | node_start.append(i)
94 |
95 | node_end = []
96 | for i in df_data['end'].tolist():
97 | node_end.append(i)
98 |
99 | # 去重
100 | node_start = list(set(node_start))
101 | node_end = list(set(node_end))
102 | return node_start, node_end
103 |
104 |
105 | if __name__ == '__main__':
106 | df_data = load_data()
107 | # print(df_data.head())
108 | node_start, node_end = data_extraction(df_data)
109 | # 创建图
110 | create_data = DataToNeo4j()
111 | # 节点
112 | create_data.create_node(node_start, node_end)
113 | # 关系
114 | create_data.create_relation(df_data)
--------------------------------------------------------------------------------
/py2neo操作neo4j/py2neo简单练习/mingchaonaxieshier.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/py2neo简单练习/mingchaonaxieshier.xlsx
--------------------------------------------------------------------------------
/py2neo操作neo4j/py2neo简单练习/santi.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/py2neo简单练习/santi.xlsx
--------------------------------------------------------------------------------
/py2neo操作neo4j/py2neo简单练习/test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/py2neo简单练习/test.xlsx
--------------------------------------------------------------------------------
/py2neo操作neo4j/readme.txt:
--------------------------------------------------------------------------------
1 | 1. Create a node
2 |    create (n:Person {name:"我", age:21})
3 | 2. Create a relationship
4 |    create (p:Person {name:"我", age:"23"})-[:包工程{金额:10000}]->(n:Person {name:"好大哥", age:"35"})
5 | 3. Delete a node. Note: a node with relationships cannot be deleted until the relationships are removed first
6 |    create (n:Person {name:"XL", age:23})
7 |    match (n:Person {name:"XL"}) delete n
8 | 4. Delete a relationship
9 |    match (p:Person {name:"我", age:"23"})-[f:包工程{金额:10000}]->(n:Person {name:"好大哥", age:"35"}) delete f
10 | 5. Attach a label
11 |    match (t:Person) where id(t)=2 set t:好人 return t
12 |    i.e. find a node by its id, then attach the label 好人 to it
13 | 6. Add an extra property
14 |    match (a:好人) where id(a)=2 set a.战斗力=200 return a
15 |    find the node with id 2 under the 好人 label and set its 战斗力 property to 200
16 | 7. Query
17 |    create (:Person {name:"唐僧", age:"79"})-[:师傅 {s_time:"2020-11-23"}]->(:Person {name:"孙悟空", age:"1w"})
18 |    match (a:Person)-[:师傅]->(b:Person) return a,b
19 |    create a 师傅 relationship between 唐僧 and 孙悟空, then find all node pairs linked by 师傅
20 | 8. Quickly clear the whole database
21 |    match (n) detach delete n
--------------------------------------------------------------------------------
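
The same operations through py2neo, for comparison (a sketch; Graph() needs your own connection details):

from py2neo import Graph, Node, Relationship

graph = Graph()  # fill in your own uri / auth

me = Node('Person', name='我', age=21)
boss = Node('Person', name='好大哥', age=35)
deal = Relationship(me, '包工程', boss, 金额=10000)
graph.create(deal)  # creates both endpoint nodes and the relationship in one shot

graph.run('match (n) detach delete n')  # item 8: wipe the whole database
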
/py2neo操作neo4j/事件三元组抽取/ltp的使用.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : ltp的使用.py
4 | # @Time : 2020/11/25 9:20 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | from ltp import LTP
10 | # 安装ltp: pip install ltp -i https://pypi.douban.com/simple/
11 | # 学习文档: http://ltp.ai/docs/quickstart.html
12 |
13 |
14 | def fenju():
15 | # 分句子
16 | sents = ltp.sent_split(["他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"])
17 | print(sents)
18 |
19 |
20 | def fenci():
21 | # 可以加载自己的词表
22 | ltp.init_dict(path='my_vocab.txt', max_window=4)
23 | segment, _ = ltp.seg(['我是你爸,我是你妈'])
24 | print(segment)
25 |
26 |
27 | def cixingbiaozhu():
28 | seg, hidden = ltp.seg(['他叫汤姆去拿外衣。'])
29 | pos = ltp.pos(hidden)
30 | print(seg)
31 | print(pos)
32 |
33 |
34 | def mingmingshitishibie():
35 | seg, hidden = ltp.seg(['他叫汤姆去拿外衣。孙悟空不同意咋办? 但是奥特曼肯定会同意'])
36 | ner = ltp.ner(hidden)
37 | print(seg)
38 | print(ner)
39 |
40 | for i in ner[0]:
41 | tag = i[0]
42 | name = seg[0][i[1]: i[2]+1]
43 | print(tag, ":", name)
44 |
45 |
46 | def yuyijuesebiaozhu():
47 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
48 | srl = ltp.srl(hidden)
49 | print(srl) # 包含了空
50 |
51 | srl = ltp.srl(hidden, keep_empty=False)
52 | print(srl)
53 |
54 |
55 | def yicunjufafenxi():
56 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
57 | dep = ltp.dep(hidden)
58 | print(dep)
59 |
60 |
61 | def yicunjufashu():
62 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
63 | sdp = ltp.sdp(hidden, graph=False)
64 | print(sdp)
65 |
66 |
67 | def yicunjufafenxitu():
68 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
69 | sdp = ltp.sdp(hidden, graph=True)
70 | print(sdp)
71 |
72 |
73 |
74 |
75 |
76 | if __name__ == '__main__':
77 | ltp = LTP() # ltp = LTP(path = "base|small|tiny") 默认下载small
78 |
79 | # 1. 分句
80 | # fenju()
81 |
82 | # 2. 分词
83 | # fenci()
84 |
85 | # 3. 词性标注
86 | # cixingbiaozhu()
87 |
88 | # 4. 命名实体识别
89 | # mingmingshitishibie()
90 |
91 | # 5. 语义角色标注
92 | # yuyijuesebiaozhu()
93 |
94 | # 6. 依存句法分析
95 | # yicunjufafenxi()
96 |
97 | # 7. 依存句法树
98 | yicunjufashu()
99 |
100 | # 8. 依存句法分析(图)
101 | yicunjufafenxitu()
102 |
--------------------------------------------------------------------------------
/py2neo操作neo4j/事件三元组抽取/my_vocab.txt:
--------------------------------------------------------------------------------
1 | 我是
2 | 你妈
--------------------------------------------------------------------------------
/py2neo操作neo4j/事件三元组抽取/readme.txt:
--------------------------------------------------------------------------------
1 | --------------2020-12-9 更新----------------------
2 | 这里不建议用ltp做三元组抽取,最近学习了一个深度学习模型进行三元组抽取 在我的另一个仓库
3 |
4 | [链接](https://github.com/shawroad/NLP_pytorch_project/tree/master/relation_extraction/lstm_cnn_information_extract)
5 |
6 |
7 | --------------2020-11-28 更新----------------------
8 | 迪哥使用的是pyltp。 这里我不推荐用pyltp,这个包目前已经不更新了。已经是老古董了。加载的模型估计也过时了。
9 |
10 | 这里我推荐使用ltp
11 |
12 | 安装: pip install ltp -i https://pypi.douban.com/simple/
13 |
14 | 测试安装成功与否: from ltp import LTP
15 |
16 | 安装成功后 下载模型 直接执行下面的代码 就可以下载了
17 | from ltp import LTP
18 | ltp = LTP() # ltp = LTP(path = "base|small|tiny") 可以指定参数 默认下载的是small 180m左右
19 |
20 |
21 | 这些操作完成以后 建议先看看ltp的使用方法
22 | 可以看代码 ltp的使用.py 或者看官方文档:http://ltp.ai/docs/quickstart.html
23 | 然后在去看三元组的抽取
24 |
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/.DS_Store
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/ahocorasick的使用/demo.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : demo.py
4 | # @Time : 2020/11/25 10:26 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | # 安装 pip install ahocorasick -i https://pypi.douban.com/simple
10 | import ahocorasick
11 |
12 |
13 | if __name__ == '__main__':
14 | wordlist = ['长春海外制药接骨续筋片', '香菇炖甲鱼', '三鹤药业黄柏胶囊', '上海衡山熊去氧胆酸片', '升和药业依托泊苷注射液', '怡诺思',
15 | '人格障碍', '转铁蛋白饱和度', '脾囊肿', '素烧白萝卜', '利君现代冠脉宁片', '上海复华药业注射用还原型谷', '阴囊上有白色小疙瘩',
16 | '腹痛伴休克', '成都通德胰激肽原酶肠溶片', '蒸猪肝', '河北百善血尿胶囊', '精神障碍', '输卵管畸形', '元和抑眩宁胶囊', '莲藕豆腐',
17 | '辰欣哈西奈德溶液', '信谊烟酸片', '慢性胆囊炎', '参芪降糖颗粒', '康普药业盐酸普萘洛尔片', '西安迪赛胸腺肽肠溶片',
18 | '双鹭药业注射用复合辅酶', '慢性筛窦炎', '新高制药维胺酯维E乳膏', '冰黄肤乐软膏', '神经类疾病', '液晶热图',
19 | '枣(干)', '股外侧皮神经病', '浙江惠松硅炭银片', '牙根外露', '湖北潜江氯霉素滴眼液', '盐类皮质激素分泌过多', '五子衍宗丸',
20 | '小儿阵发性睡眠性血红蛋白尿症', '功能失调性子宫出血病', '茵栀黄口服液', '眼底出血和渗出', '斯达制药注射用头孢噻肟钠', '复方白芷酊',
21 | '胫腓骨骨折', '西南药业氯霉素片', '宫颈炎', '茶碱缓释胶囊', '原发性硬化性胆管炎', '郑州韩都利肺胶囊', '咽反射消失',
22 | '脊髓灰质炎', '甲状腺片', '回盲瓣功能不全', '牛黄清胃丸', '乙肝e抗体', '马齿苋粥', '动脉硬化', '宝宝乐', '肠闭锁', '肺放线菌病',
23 | '江苏晨牌产妇安颗粒', '犬吠样咳嗽', '胃康灵胶囊', '小儿烟酸缺乏病', '青龙防风通圣丸', '广东南国维生素C片', '碘化油咀嚼片',
24 | '西乐葆', '伟哥甲磺酸酚妥拉明分散片', '成都迪康药业樟脑醑', '斑疹', '五花炖墨鱼', '肉炖芸豆粉条', '陕西东泰制药益脉康胶囊',
25 | '桔梗八味颗粒', '华南牌溴丙胺太林片', '吉林敖东洮南小牛脾提取物注', '仁青芒觉', '血吸虫病与肝胆疾病', '持续性枕横位难产',
26 | '弯曲菌感染', '丝瓜蘑菇肉片汤', '长春银诺克清咽片', '肝叶萎缩', '迪皿盐酸左西替利嗪口服溶液', '阿司匹林']
27 |
28 | # 建树
29 | actree = ahocorasick.Automaton()
30 | for index, word in enumerate(wordlist):
31 | actree.add_word(word, (index, word))
32 | actree.make_automaton()
33 |
34 | for i in actree.iter('昨天发烧,服用了阿司匹林,并且还吃了牛黄清胃丸,饭是吃了瓜烧白菜,大便有点色浅'):
35 | print(i)
36 |
--------------------------------------------------------------------------------
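
actree.iter() yields (end_index, value) pairs, where value is whatever add_word stored; the start of each match can be recovered from the word length. A small self-contained sketch:

import ahocorasick

actree = ahocorasick.Automaton()
for index, word in enumerate(['阿司匹林', '牛黄清胃丸']):
    actree.add_word(word, (index, word))
actree.make_automaton()

sentence = '昨天发烧,服用了阿司匹林,并且还吃了牛黄清胃丸'
for end_index, (idx, word) in actree.iter(sentence):
    start_index = end_index - len(word) + 1
    print(word, '==', sentence[start_index:end_index + 1])
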
/py2neo操作neo4j/医疗知识图谱问答/build_medical_graph.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : build_medical_graph.py
4 | # @Time : 2020/11/24 8:39 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import json
10 | from py2neo import Graph, Node
11 |
12 |
13 | class MedicalGraph:
14 | def __init__(self):
15 | self.data_path = './data/medical_min.json'
16 | self.g = Graph() # 这里填自己的信息
17 | self.g.delete_all() # 将之前的图 全部删除
18 |
19 | def read_nodes(self):
20 | # 共7类节点
21 | drugs = [] # 药品
22 | foods = [] # 食物
23 | checks = [] # 检查
24 | departments = [] # 科室
25 | producers = [] # 药品大类
26 | diseases = [] # 疾病
27 | symptoms = [] # 症状
28 |
29 | disease_infos = [] # 疾病信息
30 |
31 | # 构建节点实体关系
32 | rels_department = [] # 科室-科室关系
33 | rels_noteat = [] # 疾病-忌吃食物关系
34 | rels_doeat = [] # 疾病-宜吃食物关系
35 | rels_recommandeat = [] # 疾病-推荐吃食物关系
36 | rels_commonddrug = [] # 疾病-通用药品关系
37 | rels_recommanddrug = [] # 疾病-热门药品关系
38 | rels_check = [] # 疾病-检查关系
39 | rels_drug_producer = [] # 厂商-药物关系
40 |
41 | rels_symptom = [] # 疾病症状关系
42 | rels_acompany = [] # 疾病并发关系
43 | rels_category = [] # 疾病与科室之间的关系
44 |
45 | count = 0
46 | for data in open(self.data_path, encoding='utf8'):
47 | disease_dict = {}
48 | count += 1
49 | print(count)
50 | data_json = json.loads(data)
51 | disease = data_json['name'] # 疾病名
52 | disease_dict['name'] = disease
53 | diseases.append(disease)
54 | disease_dict['desc'] = ''
55 | disease_dict['prevent'] = ''
56 | disease_dict['cause'] = ''
57 | disease_dict['easy_get'] = ''
58 | disease_dict['cure_department'] = ''
59 | disease_dict['cure_way'] = ''
60 | disease_dict['cure_lasttime'] = ''
61 | disease_dict['symptom'] = ''
62 | disease_dict['cured_prob'] = ''
63 |
64 | # 做症状 然后做疾病和症状的关系
65 | if 'symptom' in data_json:
66 | symptoms += data_json['symptom'] # 这里加入所有的症状
67 | for symptom in data_json['symptom']:
68 | rels_symptom.append([disease, symptom])
69 |
70 | # 做并发症 并做疾病与并发症的关系
71 | if 'acompany' in data_json:
72 | for acompany in data_json['acompany']:
73 | rels_acompany.append([disease, acompany])
74 |
75 | # 做描述 不和病做关系 当做病的属性
76 | if 'desc' in data_json:
77 | disease_dict['desc'] = data_json['desc']
78 |
79 | #
80 | if 'prevent' in data_json:
81 | disease_dict['prevent'] = data_json['prevent']
82 |
83 | if 'cause' in data_json:
84 | disease_dict['cause'] = data_json['cause']
85 |
86 | if 'get_prob' in data_json:
87 | disease_dict['get_prob'] = data_json['get_prob']
88 |
89 | if 'easy_get' in data_json:
90 | disease_dict['easy_get'] = data_json['easy_get']
91 |
92 | # 科室
93 | if 'cure_department' in data_json:
94 | cure_department = data_json['cure_department']
95 | if len(cure_department) == 1:
96 | rels_category.append([disease, cure_department[0]])
97 | if len(cure_department) == 2:
98 | big = cure_department[0]
99 | small = cure_department[1]
100 | rels_department.append([small, big])
101 | rels_category.append([disease, small])
102 | disease_dict['cure_department'] = cure_department
103 | departments += cure_department
104 |
105 | if 'cure_way' in data_json:
106 | disease_dict['cure_way'] = data_json['cure_way']
107 |
108 | if 'cure_lasttime' in data_json:
109 | disease_dict['cure_lasttime'] = data_json['cure_lasttime']
110 |
111 | if 'cured_prob' in data_json:
112 | disease_dict['cured_prob'] = data_json['cured_prob']
113 |
114 | if 'common_drug' in data_json:
115 | common_drug = data_json['common_drug']
116 | for drug in common_drug:
117 | rels_commonddrug.append([disease, drug])
118 | drugs += common_drug
119 |
120 | if 'recommand_drug' in data_json:
121 | recommand_drug = data_json['recommand_drug']
122 | drugs += recommand_drug
123 | for drug in recommand_drug:
124 | rels_recommanddrug.append([disease, drug])
125 |
126 |             if 'not_eat' in data_json:
127 |                 not_eat = data_json['not_eat']
128 |                 for _not in not_eat:
129 |                     rels_noteat.append([disease, _not])
130 |                 foods += not_eat
131 |             # guard each food list separately: a record may have not_eat without do_eat / recommand_eat
132 |             if 'do_eat' in data_json:
133 |                 do_eat = data_json['do_eat']
134 |                 for _do in do_eat:
135 |                     rels_doeat.append([disease, _do])
136 |                 foods += do_eat
137 |             if 'recommand_eat' in data_json:
138 |                 recommand_eat = data_json['recommand_eat']
139 |                 for _recommand in recommand_eat:
140 |                     rels_recommandeat.append([disease, _recommand])
141 |                 foods += recommand_eat
142 |
143 | if 'check' in data_json:
144 | check = data_json['check']
145 | for _check in check:
146 | rels_check.append([disease, _check])
147 | checks += check
148 | if 'drug_detail' in data_json:
149 | drug_detail = data_json['drug_detail']
150 | producer = [i.split('(')[0] for i in drug_detail]
151 | rels_drug_producer += [[i.split('(')[0], i.split('(')[-1].replace(')', '')] for i in drug_detail]
152 | producers += producer
153 | disease_infos.append(disease_dict)
154 | return set(drugs), set(foods), set(checks), set(departments), set(producers), set(symptoms), set(diseases), \
155 | disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, \
156 | rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category
157 |
158 | def create_diseases_nodes(self, disease_infos):
159 | '''
160 | 创建知识图谱中心疾病的节点
161 | '''
162 | count = 0
163 | for disease_dict in disease_infos:
164 | # 疾病节点里面包含几种属性信息
165 | node = Node("Disease", name=disease_dict['name'], desc=disease_dict['desc'],
166 | prevent=disease_dict['prevent'], cause=disease_dict['cause'],
167 | easy_get=disease_dict['easy_get'], cure_lasttime=disease_dict['cure_lasttime'],
168 | cure_department=disease_dict['cure_department']
169 | , cure_way=disease_dict['cure_way'], cured_prob=disease_dict['cured_prob'])
170 | self.g.create(node)
171 | count += 1
172 | print(count)
173 | return
174 |
175 | def create_node(self, label, nodes):
176 | '''
177 | 建立节点
178 | '''
179 | count = 0
180 | for node_name in nodes:
181 | node = Node(label, name=node_name)
182 | self.g.create(node)
183 | count += 1
184 | print(count, len(nodes))
185 | return
186 |
187 | def create_graphnodes(self):
188 | Drugs, Foods, Checks, Departments, Producers, Symptoms, Diseases, disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category = self.read_nodes()
189 | self.create_diseases_nodes(disease_infos)
190 |
191 | self.create_node('Drug', Drugs)
192 | print(len(Drugs))
193 |
194 | self.create_node('Food', Foods)
195 | print(len(Foods))
196 |
197 | self.create_node('Check', Checks)
198 | print(len(Checks))
199 |
200 | self.create_node('Department', Departments)
201 | print(len(Departments))
202 |
203 | self.create_node('Producer', Producers)
204 | print(len(Producers))
205 |
206 | self.create_node('Symptom', Symptoms)
207 |
208 | return
209 |
210 | def create_graphrels(self):
211 | Drugs, Foods, Checks, Departments, Producers, Symptoms, Diseases, disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category = self.read_nodes()
212 | self.create_relationship('Disease', 'Food', rels_recommandeat, 'recommand_eat', '推荐食谱')
213 | self.create_relationship('Disease', 'Food', rels_noteat, 'no_eat', '忌吃')
214 | self.create_relationship('Disease', 'Food', rels_doeat, 'do_eat', '宜吃')
215 | self.create_relationship('Department', 'Department', rels_department, 'belongs_to', '属于')
216 | self.create_relationship('Disease', 'Drug', rels_commonddrug, 'common_drug', '常用药品')
217 | self.create_relationship('Producer', 'Drug', rels_drug_producer, 'drugs_of', '生产药品')
218 | self.create_relationship('Disease', 'Drug', rels_recommanddrug, 'recommand_drug', '好评药品')
219 | self.create_relationship('Disease', 'Check', rels_check, 'need_check', '诊断检查')
220 | self.create_relationship('Disease', 'Symptom', rels_symptom, 'has_symptom', '症状')
221 | self.create_relationship('Disease', 'Disease', rels_acompany, 'acompany_with', '并发症')
222 | self.create_relationship('Disease', 'Department', rels_category, 'belongs_to', '所属科室')
223 |
224 | def create_relationship(self, start_node, end_node, edges, rel_type, rel_name):
225 | '''创建实体关联边'''
226 | count = 0
227 | # 去重处理
228 | set_edges = []
229 |
230 | for edge in edges:
231 | set_edges.append('###'.join(edge))
232 |
233 | all = len(set(set_edges))
234 | for edge in set(set_edges):
235 | edge = edge.split('###')
236 | p = edge[0]
237 | q = edge[1]
238 | query = "match(p:%s),(q:%s) where p.name='%s'and q.name='%s' create (p)-[rel:%s{name:'%s'}]->(q)" % (
239 | start_node, end_node, p, q, rel_type, rel_name)
240 | try:
241 | self.g.run(query)
242 | count += 1
243 | print(rel_type, count, all)
244 | except Exception as e:
245 | print(e)
246 | return
247 |
248 |
249 | if __name__ == '__main__':
250 | # 实例化类图
251 | handler = MedicalGraph()
252 |
253 | # 创建节点
254 | handler.create_graphnodes()
255 | # 创建关系
256 | handler.create_graphrels()
257 |
258 |
259 |
260 |
--------------------------------------------------------------------------------
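
create_relationship builds its Cypher by %-formatting the node names into the query string, which breaks as soon as a name contains a quote. py2neo's run() accepts query parameters for property values (labels and relationship types still cannot be parameterized); a sketch with hypothetical values:

from py2neo import Graph

g = Graph()
start_node, end_node, rel_type, rel_name = 'Disease', 'Food', 'do_eat', '宜吃'
p, q = '百日咳', '鸡蛋'  # hypothetical node names

# labels / rel types are formatted in; the name values travel as safe parameters
query = (
    "match (p:%s), (q:%s) where p.name = $p and q.name = $q "
    "create (p)-[rel:%s {name: $rel_name}]->(q)" % (start_node, end_node, rel_type)
)
g.run(query, p=p, q=q, rel_name=rel_name)
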
/py2neo操作neo4j/医疗知识图谱问答/data_process/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : __init__.py.py
4 | # @Time : 2020/11/25 10:15 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 |
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/answer_search.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/answer_search.cpython-37.pyc
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_classifier.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_classifier.cpython-37.pyc
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_parser.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_parser.cpython-37.pyc
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/data_process/answer_search.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : answer_search.py
4 | # @Time : 2020/11/25 11:08 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | from py2neo import Graph
10 |
11 |
12 | class AnswerSearcher:
13 | def __init__(self):
14 | self.g = Graph()
15 | self.num_limit = 20
16 |
17 | def search_main(self, sqls):
18 | # 执行cypher查询,并返回相应结果
19 | final_answers = []
20 | for sql_ in sqls:
21 | question_type = sql_['question_type']
22 | queries = sql_['sql']
23 | answers = []
24 | for query in queries:
25 | ress = self.g.run(query).data()
26 | answers += ress
27 | final_answer = self.answer_prettify(question_type, answers)
28 | if final_answer:
29 | final_answers.append(final_answer)
30 | return final_answers
31 |
32 | def answer_prettify(self, question_type, answers):
33 |         # 根据对应的question_type,调用相应的回复模板
34 | final_answer = []
35 | if not answers:
36 | return ''
37 | if question_type == 'disease_symptom':
38 | desc = [i['n.name'] for i in answers]
39 | subject = answers[0]['m.name']
40 | final_answer = '{0}的症状包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
41 |
42 | elif question_type == 'symptom_disease':
43 | desc = [i['m.name'] for i in answers]
44 | subject = answers[0]['n.name']
45 |             final_answer = '出现症状{0}可能患的疾病有:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
46 |
47 | elif question_type == 'disease_cause':
48 | desc = [i['m.cause'] for i in answers]
49 | subject = answers[0]['m.name']
50 | final_answer = '{0}可能的成因有:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
51 |
52 | elif question_type == 'disease_prevent':
53 | desc = [i['m.prevent'] for i in answers]
54 | subject = answers[0]['m.name']
55 | final_answer = '{0}的预防措施包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
56 |
57 | elif question_type == 'disease_lasttime':
58 | desc = [i['m.cure_lasttime'] for i in answers]
59 | subject = answers[0]['m.name']
60 | final_answer = '{0}治疗可能持续的周期为:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
61 |
62 | elif question_type == 'disease_cureway':
63 | desc = [';'.join(i['m.cure_way']) for i in answers]
64 | subject = answers[0]['m.name']
65 | final_answer = '{0}可以尝试如下治疗:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
66 |
67 | elif question_type == 'disease_cureprob':
68 | desc = [i['m.cured_prob'] for i in answers]
69 | subject = answers[0]['m.name']
70 | final_answer = '{0}治愈的概率为(仅供参考):{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
71 |
72 | elif question_type == 'disease_easyget':
73 | desc = [i['m.easy_get'] for i in answers]
74 | subject = answers[0]['m.name']
75 |
76 | final_answer = '{0}的易感人群包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
77 |
78 | elif question_type == 'disease_desc':
79 | desc = [i['m.desc'] for i in answers]
80 | subject = answers[0]['m.name']
81 | final_answer = '{0},熟悉一下:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
82 |
83 | elif question_type == 'disease_acompany':
84 | desc1 = [i['n.name'] for i in answers]
85 | desc2 = [i['m.name'] for i in answers]
86 | subject = answers[0]['m.name']
87 | desc = [i for i in desc1 + desc2 if i != subject]
88 |             final_answer = '{0}的并发症包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
89 |
90 | elif question_type == 'disease_not_food':
91 | desc = [i['n.name'] for i in answers]
92 | subject = answers[0]['m.name']
93 | final_answer = '{0}忌食的食物包括有:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
94 |
95 | elif question_type == 'disease_do_food':
96 | do_desc = [i['n.name'] for i in answers if i['r.name'] == '宜吃']
97 | recommand_desc = [i['n.name'] for i in answers if i['r.name'] == '推荐食谱']
98 | subject = answers[0]['m.name']
99 | final_answer = '{0}宜食的食物包括有:{1}\n推荐食谱包括有:{2}'.format(subject, ';'.join(list(set(do_desc))[:self.num_limit]), ';'.join(list(set(recommand_desc))[:self.num_limit]))
100 |
101 | elif question_type == 'food_not_disease':
102 | desc = [i['m.name'] for i in answers]
103 | subject = answers[0]['n.name']
104 | final_answer = '患有{0}的人最好不要吃{1}'.format(';'.join(list(set(desc))[:self.num_limit]), subject)
105 |
106 | elif question_type == 'food_do_disease':
107 | desc = [i['m.name'] for i in answers]
108 | subject = answers[0]['n.name']
109 | final_answer = '患有{0}的人建议多试试{1}'.format(';'.join(list(set(desc))[:self.num_limit]), subject)
110 |
111 | elif question_type == 'disease_drug':
112 | desc = [i['n.name'] for i in answers]
113 | subject = answers[0]['m.name']
114 | final_answer = '{0}通常的使用的药品包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
115 |
116 | elif question_type == 'drug_disease':
117 | desc = [i['m.name'] for i in answers]
118 | subject = answers[0]['n.name']
119 | final_answer = '{0}主治的疾病有{1},可以试试'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
120 |
121 | elif question_type == 'disease_check':
122 | desc = [i['n.name'] for i in answers]
123 | subject = answers[0]['m.name']
124 | final_answer = '{0}通常可以通过以下方式检查出来:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
125 |
126 | elif question_type == 'check_disease':
127 | desc = [i['m.name'] for i in answers]
128 | subject = answers[0]['n.name']
129 | final_answer = '通常可以通过{0}检查出来的疾病有{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
130 |
131 | return final_answer
132 |
133 |
134 | if __name__ == '__main__':
135 | searcher = AnswerSearcher()
--------------------------------------------------------------------------------
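`search_main` expects exactly the structure that `QuestionPaser.parser_main` (below) produces: a list of dicts, each with a `question_type` and a list of Cypher statements under `sql`. A minimal usage sketch, assuming a local Neo4j instance that already holds the graph built above and contains a Disease node named 感冒:

searcher = AnswerSearcher()
sqls = [{'question_type': 'disease_symptom',
         'sql': ["MATCH (m:Disease)-[r:has_symptom]->(n:Symptom) "
                 "where m.name = '感冒' return m.name, r.name, n.name"]}]
print(searcher.search_main(sqls))   # e.g. ['感冒的症状包括:...']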
/py2neo操作neo4j/医疗知识图谱问答/data_process/question_classifier.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : question_classifier.py
4 | # @Time : 2020/11/25 10:16 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import os
10 | import ahocorasick
11 | from pdb import set_trace
12 |
13 |
14 | class QuestionClassifier:
15 | def __init__(self):
16 | # 特征词路径
17 | self.disease_path = './dict/disease.txt'
18 | self.department_path = './dict/department.txt'
19 | self.check_path = './dict/check.txt'
20 | self.drug_path = './dict/drug.txt'
21 | self.food_path = './dict/food.txt'
22 | self.producer_path = './dict/producer.txt'
23 | self.symptom_path = './dict/symptom.txt'
24 | self.deny_path = './dict/deny.txt'
25 |
26 | # 加载特征词
27 |         self.disease_wds = [i.strip() for i in open(self.disease_path, encoding='utf-8') if i.strip()]
28 |         self.department_wds = [i.strip() for i in open(self.department_path, encoding='utf-8') if i.strip()]
29 |         self.check_wds = [i.strip() for i in open(self.check_path, encoding='utf-8') if i.strip()]
30 |         self.drug_wds = [i.strip() for i in open(self.drug_path, encoding='utf-8') if i.strip()]
31 |         self.food_wds = [i.strip() for i in open(self.food_path, encoding='utf-8') if i.strip()]
32 |         self.producer_wds = [i.strip() for i in open(self.producer_path, encoding='utf-8') if i.strip()]
33 |         self.symptom_wds = [i.strip() for i in open(self.symptom_path, encoding='utf-8') if i.strip()]
34 |         self.region_words = set(self.department_wds + self.disease_wds + self.check_wds + self.drug_wds + self.food_wds + self.producer_wds + self.symptom_wds)
35 |         self.deny_words = [i.strip() for i in open(self.deny_path, encoding='utf-8') if i.strip()]
36 |
37 | # 建树 加快检索 可参考ahocorasick的使用 进行学习 actree
38 | self.region_tree = self.build_actree(list(self.region_words))
39 |
40 | # 构建词典
41 | self.wdtype_dict = self.build_wdtype_dict()
42 |
43 | # 问句疑问词
44 | self.symptom_qwds = ['症状', '表征', '现象', '症候', '表现']
45 | self.cause_qwds = ['原因', '成因', '为什么', '怎么会', '怎样才', '咋样才', '怎样会', '如何会', '为啥', '为何', '如何才会', '怎么才会', '会导致', '会造成']
46 | self.acompany_qwds = ['并发症', '并发', '一起发生', '一并发生', '一起出现', '一并出现', '一同发生', '一同出现', '伴随发生', '伴随', '共现']
47 |         self.food_qwds = ['饮食', '饮用', '吃', '食', '伙食', '膳食', '喝', '菜', '忌口', '补品', '保健品', '食谱', '菜谱', '食用', '食物']
48 | self.drug_qwds = ['药', '药品', '用药', '胶囊', '口服液', '炎片']
49 |         self.prevent_qwds = ['预防', '防范', '抵制', '抵御', '防止', '躲避', '逃避', '避开', '免得', '逃开', '避掉', '躲开', '躲掉', '绕开',
50 | '怎样才能不', '怎么才能不', '咋样才能不', '咋才能不', '如何才能不',
51 | '怎样才不', '怎么才不', '咋样才不', '咋才不', '如何才不',
52 | '怎样才可以不', '怎么才可以不', '咋样才可以不', '咋才可以不', '如何可以不',
53 | '怎样才可不', '怎么才可不', '咋样才可不', '咋才可不', '如何可不']
54 | self.lasttime_qwds = ['周期', '多久', '多长时间', '多少时间', '几天', '几年', '多少天', '多少小时', '几个小时', '多少年']
55 |         self.cureway_qwds = ['怎么治疗', '如何医治', '怎么医治', '怎么治', '怎么医', '如何治', '医治方式', '疗法', '咋治', '怎么办', '咋办']
56 | self.cureprob_qwds = ['多大概率能治好', '多大几率能治好', '治好希望大么', '几率', '几成', '比例', '可能性', '能治', '可治', '可以治', '可以医']
57 | self.easyget_qwds = ['易感人群', '容易感染', '易发人群', '什么人', '哪些人', '感染', '染上', '得上']
58 |         self.check_qwds = ['检查', '检查项目', '查出', '测出', '试出']
59 | self.belong_qwds = ['属于什么科', '属于', '什么科', '科室']
60 | self.cure_qwds = ['治疗什么', '治啥', '治疗啥', '医治啥', '治愈啥', '主治啥', '主治什么', '有什么用', '有何用', '用处', '用途',
61 | '有什么好处', '有什么益处', '有何益处', '用来', '用来做啥', '用来作甚', '需要', '要']
62 |
63 | print('model init finished ......')
64 | return
65 |
66 | '''分类主函数'''
67 | def classify(self, question):
68 | data = {}
69 | medical_dict = self.check_medical(question)
70 |
71 | if not medical_dict:
72 | return {}
73 |
74 | data['args'] = medical_dict
75 |
76 | # 收集问句当中所涉及到的实体类型
77 | types = []
78 | for type_ in medical_dict.values():
79 | types += type_
80 |
81 | question_types = []
82 | # 症状
83 | if self.check_words(self.symptom_qwds, question) and ('disease' in types):
84 | question_type = 'disease_symptom'
85 | question_types.append(question_type)
86 |
87 | if self.check_words(self.symptom_qwds, question) and ('symptom' in types):
88 | question_type = 'symptom_disease'
89 | question_types.append(question_type)
90 |
91 | # 原因
92 | if self.check_words(self.cause_qwds, question) and ('disease' in types):
93 | question_type = 'disease_cause'
94 | question_types.append(question_type)
95 | # 并发症
96 | if self.check_words(self.acompany_qwds, question) and ('disease' in types):
97 | question_type = 'disease_acompany'
98 | question_types.append(question_type)
99 |
100 | # 推荐食品
101 | if self.check_words(self.food_qwds, question) and 'disease' in types:
102 | deny_status = self.check_words(self.deny_words, question)
103 | if deny_status:
104 | question_type = 'disease_not_food'
105 | else:
106 | question_type = 'disease_do_food'
107 | question_types.append(question_type)
108 |
109 | # 已知食物找疾病
110 | if self.check_words(self.food_qwds+self.cure_qwds, question) and 'food' in types:
111 | deny_status = self.check_words(self.deny_words, question)
112 | if deny_status:
113 | question_type = 'food_not_disease'
114 | else:
115 | question_type = 'food_do_disease'
116 | question_types.append(question_type)
117 |
118 | # 推荐药品
119 | if self.check_words(self.drug_qwds, question) and 'disease' in types:
120 | question_type = 'disease_drug'
121 | question_types.append(question_type)
122 |
123 | # 药品治啥病
124 | if self.check_words(self.cure_qwds, question) and 'drug' in types:
125 | question_type = 'drug_disease'
126 | question_types.append(question_type)
127 |
128 | # 疾病接受检查项目
129 | if self.check_words(self.check_qwds, question) and 'disease' in types:
130 | question_type = 'disease_check'
131 | question_types.append(question_type)
132 |
133 | # 已知检查项目查相应疾病
134 | if self.check_words(self.check_qwds+self.cure_qwds, question) and 'check' in types:
135 | question_type = 'check_disease'
136 | question_types.append(question_type)
137 |
138 | # 症状防御
139 | if self.check_words(self.prevent_qwds, question) and 'disease' in types:
140 | question_type = 'disease_prevent'
141 | question_types.append(question_type)
142 |
143 | # 疾病医疗周期
144 | if self.check_words(self.lasttime_qwds, question) and 'disease' in types:
145 | question_type = 'disease_lasttime'
146 | question_types.append(question_type)
147 |
148 | # 疾病治疗方式
149 | if self.check_words(self.cureway_qwds, question) and 'disease' in types:
150 | question_type = 'disease_cureway'
151 | question_types.append(question_type)
152 |
153 | # 疾病治愈可能性
154 | if self.check_words(self.cureprob_qwds, question) and 'disease' in types:
155 | question_type = 'disease_cureprob'
156 | question_types.append(question_type)
157 |
158 | # 疾病易感染人群
159 |         if self.check_words(self.easyget_qwds, question) and 'disease' in types:
160 | question_type = 'disease_easyget'
161 | question_types.append(question_type)
162 |
163 | # 若没有查到相关的外部查询信息,那么则将该疾病的描述信息返回
164 | if question_types == [] and 'disease' in types:
165 | question_types = ['disease_desc']
166 |
167 |         # 同理,若只识别出症状实体且未匹配到问句类型,则按症状反查疾病
168 | if question_types == [] and 'symptom' in types:
169 | question_types = ['symptom_disease']
170 |
171 | # 将多个分类结果进行合并处理,组装成一个字典
172 | data['question_types'] = question_types
173 |
174 | return data
175 |
176 | def build_wdtype_dict(self):
177 | # 构建词对应的类型 将词和对应的类型组成字典
178 | wd_dict = dict()
179 | for wd in self.region_words:
180 | wd_dict[wd] = []
181 | if wd in self.disease_wds:
182 | wd_dict[wd].append('disease')
183 | if wd in self.department_wds:
184 | wd_dict[wd].append('department')
185 | if wd in self.check_wds:
186 | wd_dict[wd].append('check')
187 | if wd in self.drug_wds:
188 | wd_dict[wd].append('drug')
189 | if wd in self.food_wds:
190 | wd_dict[wd].append('food')
191 | if wd in self.symptom_wds:
192 | wd_dict[wd].append('symptom')
193 | if wd in self.producer_wds:
194 | wd_dict[wd].append('producer')
195 | return wd_dict
196 |
197 | def build_actree(self, wordlist):
198 | # 构造actree树 加速过滤
199 | actree = ahocorasick.Automaton()
200 | for index, word in enumerate(wordlist):
201 | actree.add_word(word, (index, word))
202 | actree.make_automaton()
203 | return actree
204 |
205 | def check_medical(self, question):
206 | # 当用户输入一个问题时 先对问题进行过滤
207 | region_wds = []
208 | for i in self.region_tree.iter(question):
209 |             wd = i[1][1]  # iter返回(结束下标, (插入序号, 词)),这里取出匹配到的词本身
210 | region_wds.append(wd)
211 | stop_wds = []
212 | for wd1 in region_wds:
213 | for wd2 in region_wds:
214 | if wd1 in wd2 and wd1 != wd2:
215 | stop_wds.append(wd1)
216 | final_wds = [i for i in region_wds if i not in stop_wds]
217 | final_dict = {i: self.wdtype_dict.get(i) for i in final_wds}
218 | return final_dict
219 |
220 | def check_words(self, wds, sent):
221 | # 基于特征词进行分类 看当前特征在这个问题中包含不包含
222 | for wd in wds:
223 | if wd in sent:
224 | return True
225 | return False
226 |
227 |
228 | if __name__ == '__main__':
229 | handler = QuestionClassifier()
230 | while True:
231 | question = input("input an question:")
232 | data = handler.classify(question)
233 | print(data)
234 |
--------------------------------------------------------------------------------
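The Aho-Corasick automaton is what makes `check_medical` fast: a single pass over the question matches every dictionary word at once. `Automaton.iter()` yields `(end_index, value)` pairs, where `value` is whatever tuple `add_word` stored, here `(index, word)`, which is why the code reads the word out of `i[1][1]`. A standalone sketch with made-up dictionary words:

import ahocorasick

A = ahocorasick.Automaton()
for idx, word in enumerate(['感冒', '头痛', '感冒灵']):
    A.add_word(word, (idx, word))
A.make_automaton()

for end_index, (insert_order, word) in A.iter('感冒了头痛怎么办'):
    print(end_index, word)   # finds 感冒 and 头痛 in one scan of the question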
/py2neo操作neo4j/医疗知识图谱问答/data_process/question_parser.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : question_parser.py
4 | # @Time : 2020/11/25 11:04 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | class QuestionPaser:
10 | def build_entitydict(self, args):
11 | # 构建实体节点
12 | entity_dict = {}
13 | for arg, types in args.items():
14 | for type in types:
15 | if type not in entity_dict:
16 | entity_dict[type] = [arg]
17 | else:
18 | entity_dict[type].append(arg)
19 | return entity_dict
20 |
21 | def parser_main(self, res_classify):
22 | # 解析主函数
23 | args = res_classify['args']
24 | entity_dict = self.build_entitydict(args)
25 | question_types = res_classify['question_types']
26 | sqls = []
27 | for question_type in question_types:
28 | sql_ = {}
29 | sql_['question_type'] = question_type
30 | sql = []
31 | if question_type == 'disease_symptom':
32 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
33 |
34 | elif question_type == 'symptom_disease':
35 | sql = self.sql_transfer(question_type, entity_dict.get('symptom'))
36 |
37 | elif question_type == 'disease_cause':
38 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
39 |
40 | elif question_type == 'disease_acompany':
41 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
42 |
43 | elif question_type == 'disease_not_food':
44 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
45 |
46 | elif question_type == 'disease_do_food':
47 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
48 |
49 | elif question_type == 'food_not_disease':
50 | sql = self.sql_transfer(question_type, entity_dict.get('food'))
51 |
52 | elif question_type == 'food_do_disease':
53 | sql = self.sql_transfer(question_type, entity_dict.get('food'))
54 |
55 | elif question_type == 'disease_drug':
56 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
57 |
58 | elif question_type == 'drug_disease':
59 | sql = self.sql_transfer(question_type, entity_dict.get('drug'))
60 |
61 | elif question_type == 'disease_check':
62 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
63 |
64 | elif question_type == 'check_disease':
65 | sql = self.sql_transfer(question_type, entity_dict.get('check'))
66 |
67 | elif question_type == 'disease_prevent':
68 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
69 |
70 | elif question_type == 'disease_lasttime':
71 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
72 |
73 | elif question_type == 'disease_cureway':
74 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
75 |
76 | elif question_type == 'disease_cureprob':
77 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
78 |
79 | elif question_type == 'disease_easyget':
80 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
81 |
82 | elif question_type == 'disease_desc':
83 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
84 |
85 | if sql:
86 | sql_['sql'] = sql
87 |
88 | sqls.append(sql_)
89 | return sqls
90 |
91 | def sql_transfer(self, question_type, entities):
92 | # 针对不同的问题 进行查找
93 | if not entities:
94 | return []
95 |
96 | # 查询语句
97 | sql = []
98 | # 查询疾病的原因
99 | if question_type == 'disease_cause':
100 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cause".format(i) for i in entities]
101 |
102 | # 查询疾病的防御措施
103 | elif question_type == 'disease_prevent':
104 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.prevent".format(i) for i in entities]
105 |
106 | # 查询疾病的持续时间
107 | elif question_type == 'disease_lasttime':
108 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cure_lasttime".format(i) for i in entities]
109 |
110 | # 查询疾病的治愈概率
111 | elif question_type == 'disease_cureprob':
112 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cured_prob".format(i) for i in entities]
113 |
114 | # 查询疾病的治疗方式
115 | elif question_type == 'disease_cureway':
116 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cure_way".format(i) for i in entities]
117 |
118 | # 查询疾病的易发人群
119 | elif question_type == 'disease_easyget':
120 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.easy_get".format(i) for i in entities]
121 |
122 | # 查询疾病的相关介绍
123 | elif question_type == 'disease_desc':
124 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.desc".format(i) for i in entities]
125 |
126 | # 查询疾病有哪些症状
127 | elif question_type == 'disease_symptom':
128 | sql = ["MATCH (m:Disease)-[r:has_symptom]->(n:Symptom) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
129 |
130 | # 查询症状会导致哪些疾病
131 | elif question_type == 'symptom_disease':
132 | sql = ["MATCH (m:Disease)-[r:has_symptom]->(n:Symptom) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
133 |
134 | # 查询疾病的并发症
135 | elif question_type == 'disease_acompany':
136 | sql1 = ["MATCH (m:Disease)-[r:acompany_with]->(n:Disease) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
137 | sql2 = ["MATCH (m:Disease)-[r:acompany_with]->(n:Disease) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
138 | sql = sql1 + sql2
139 | # 查询疾病的忌口
140 | elif question_type == 'disease_not_food':
141 | sql = ["MATCH (m:Disease)-[r:no_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
142 |
143 | # 查询疾病建议吃的东西
144 | elif question_type == 'disease_do_food':
145 | sql1 = ["MATCH (m:Disease)-[r:do_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
146 | sql2 = ["MATCH (m:Disease)-[r:recommand_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
147 | sql = sql1 + sql2
148 |
149 | # 已知忌口查疾病
150 | elif question_type == 'food_not_disease':
151 | sql = ["MATCH (m:Disease)-[r:no_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
152 |
153 | # 已知推荐查疾病
154 | elif question_type == 'food_do_disease':
155 | sql1 = ["MATCH (m:Disease)-[r:do_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
156 | sql2 = ["MATCH (m:Disease)-[r:recommand_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
157 | sql = sql1 + sql2
158 |
159 | # 查询疾病常用药品-药品别名记得扩充
160 | elif question_type == 'disease_drug':
161 | sql1 = ["MATCH (m:Disease)-[r:common_drug]->(n:Drug) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
162 | sql2 = ["MATCH (m:Disease)-[r:recommand_drug]->(n:Drug) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
163 | sql = sql1 + sql2
164 |
165 | # 已知药品查询能够治疗的疾病
166 | elif question_type == 'drug_disease':
167 | sql1 = ["MATCH (m:Disease)-[r:common_drug]->(n:Drug) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
168 | sql2 = ["MATCH (m:Disease)-[r:recommand_drug]->(n:Drug) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
169 | sql = sql1 + sql2
170 | # 查询疾病应该进行的检查
171 | elif question_type == 'disease_check':
172 | sql = ["MATCH (m:Disease)-[r:need_check]->(n:Check) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
173 |
174 | # 已知检查查询疾病
175 | elif question_type == 'check_disease':
176 | sql = ["MATCH (m:Disease)-[r:need_check]->(n:Check) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
177 | return sql
178 |
179 |
180 | if __name__ == '__main__':
181 | handler = QuestionPaser()
--------------------------------------------------------------------------------
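Taken together, the pipeline is classify -> parser_main -> search_main. For a question like '感冒有什么症状' (assuming 感冒 appears in dict/disease.txt), `classify` returns roughly:

{'args': {'感冒': ['disease']}, 'question_types': ['disease_symptom']}

and `parser_main` turns that into the Cypher payload consumed by AnswerSearcher:

[{'question_type': 'disease_symptom',
  'sql': ["MATCH (m:Disease)-[r:has_symptom]->(n:Symptom) "
          "where m.name = '感冒' return m.name, r.name, n.name"]}]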
/py2neo操作neo4j/医疗知识图谱问答/dict/deny.txt:
--------------------------------------------------------------------------------
1 | 否
2 | 非
3 | 不
4 | 无
5 | 弗
6 | 勿
7 | 毋
8 | 未
9 | 没
10 | 莫
11 | 没有
12 | 防止
13 | 不再
14 | 不会
15 | 不能
16 | 忌
17 | 禁止
18 | 防止
19 | 难以
20 | 忘记
21 | 忽视
22 | 放弃
23 | 拒绝
24 | 杜绝
25 | 不是
26 | 并未
27 | 并无
28 | 仍未
29 | 难以出现
30 | 切勿
31 | 不要
32 | 不可
33 | 别
34 | 管住
35 | 注意
36 | 小心
37 | 少
38 |
39 |
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/dict/department.txt:
--------------------------------------------------------------------------------
1 | 心理科
2 | 妇科
3 | 耳鼻喉科
4 | 中医综合
5 | 泌尿内科
6 | 康复科
7 | 神经外科
8 | 生殖健康
9 | 肿瘤科
10 | 肛肠科
11 | 儿科
12 | 普外科
13 | 心胸外科
14 | 风湿免疫科
15 | 小儿外科
16 | 传染科
17 | 减肥
18 | 其他科室
19 | 肾内科
20 | 皮肤性病科
21 | 口腔科
22 | 不孕不育
23 | 五官科
24 | 整形美容科
25 | 消化内科
26 | 急诊科
27 | 肝胆外科
28 | 遗传病科
29 | 精神科
30 | 神经内科
31 | 小儿内科
32 | 肿瘤内科
33 | 皮肤科
34 | 中医科
35 | 骨外科
36 | 外科
37 | 呼吸内科
38 | 其他综合
39 | 眼科
40 | 内分泌科
41 | 性病科
42 | 妇产科
43 | 肝病
44 | 肿瘤外科
45 | 儿科综合
46 | 营养科
47 | 男科
48 | 产科
49 | 感染科
50 | 泌尿外科
51 | 血液科
52 | 心内科
53 | 烧伤科
54 | 内科
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/run_chatbot.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : run_chatbot.py
4 | # @Time : 2020/11/25 10:07 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | from data_process.question_classifier import QuestionClassifier
10 | from data_process.question_parser import QuestionPaser
11 | from data_process.answer_search import AnswerSearcher
12 |
13 |
14 | class ChatBotGraph:
15 | def __init__(self):
16 | self.classifier = QuestionClassifier()
17 | self.parser = QuestionPaser()
18 | self.searcher = AnswerSearcher()
19 |
20 | def chat_main(self, sent):
21 | answer = "您好, 我是小路医药智能助理,希望可以帮到您。如果没答上来,可联系120。祝您身体棒棒的!!!"
22 | res_classify = self.classifier.classify(sent)
23 | if not res_classify:
24 | return answer
25 | res_sql = self.parser.parser_main(res_classify)
26 | final_answers = self.searcher.search_main(res_sql)
27 | if not final_answers:
28 | return answer
29 | else:
30 | return '\n'.join(final_answers)
31 |
32 |
33 | if __name__ == '__main__':
34 | handler = ChatBotGraph()
35 | while True:
36 | question = input("用户:")
37 | answer = handler.chat_main(question)
38 | print("小路:", answer)
39 |
--------------------------------------------------------------------------------
/pyecharts使用/001-柱状图.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-柱状图.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-17
6 | """
7 | from pyecharts.charts import Bar
8 | from pyecharts import options as opts
9 | from pyecharts.globals import ThemeType
10 |
11 |
12 | bar = (
13 | Bar({"theme": ThemeType.MACARONS}) # 设置主题
14 | # Bar()
15 | .set_global_opts(
16 | title_opts=opts.TitleOpts(title="各种衣服价格", subtitle="VS"),
17 | xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)) # 名字倾斜15度
18 | )
19 |
20 | .add_xaxis(["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"])
21 | .add_yaxis("商家A", [5, 20, 36, 10, 75, 90])
22 | .add_yaxis("商家B", [5, 20, 36, 10, 75, 90])
23 |
24 | )
25 | bar.render('柱状图.html')
--------------------------------------------------------------------------------
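`Bar({"theme": ThemeType.MACARONS})` works because pyecharts also accepts a plain dict for the init options; the more explicit, equivalent form goes through `opts.InitOpts`, which is also where the rendered page's title comes from (the "Awesome-pyecharts" seen in the HTML files below is its default `page_title`). A small sketch:

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType

bar = Bar(init_opts=opts.InitOpts(theme=ThemeType.MACARONS,
                                  page_title="各种衣服价格"))  # overrides the "Awesome-pyecharts" default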
/pyecharts使用/002-折线图.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-折线图.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-17
6 | """
7 | from pyecharts.charts import Line
8 | from pyecharts import options as opts
9 |
10 |
11 | if __name__ == "__main__":
12 | x_data = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
13 | # x_data = [1, 2, 3, 4, 5, 6, 7]
14 | y_data = [820, 932, 901, 934, 1290, 1330, 1320]
15 | y_data2 = [237, 132, 401, 534, 290, 1230, 1120]
16 |
17 | line = (
18 | Line()
19 | .set_global_opts(
20 | tooltip_opts=opts.TooltipOpts(is_show=True),
21 | title_opts=opts.TitleOpts(title="收入大比拼", pos_left="center"), # 标题
22 | legend_opts=opts.LegendOpts(pos_left="right"), # 线条示例放在右上角
23 | xaxis_opts=opts.AxisOpts(type_="category", name="星期"), # 横轴的类型与名字
24 | # 注意横轴type_等于value 和category的区别
25 | yaxis_opts=opts.AxisOpts(
26 | type_="value",
27 | name="收入",
28 | splitline_opts=opts.SplitLineOpts(is_show=True), # 是否显示横向格子线
29 | is_scale=True,
30 | ), # 纵轴的类型与名字
31 | )
32 | .add_xaxis(xaxis_data=x_data)
33 | .add_yaxis(
34 | is_smooth=True, # 是否进行平滑处理
35 | series_name="小花收入", # 标识每条线
36 | y_axis=y_data,
37 | symbol="emptyCircle",
38 | linestyle_opts=opts.LineStyleOpts(width=2), # 设置线宽
39 | is_symbol_show=True,
40 | label_opts=opts.LabelOpts(is_show=True), # is_show显示是否需要标注数据
41 | )
42 | .add_yaxis(
43 | series_name="王五", # 标识每条线
44 | y_axis=y_data2,
45 | symbol="emptyCircle",
46 | is_symbol_show=True,
47 | label_opts=opts.LabelOpts(is_show=True), # is_show显示是否需要标注数据
48 |
49 | # 自定义标记
50 | markpoint_opts=opts.MarkPointOpts(
51 | data=[opts.MarkPointItem(name="自定义标记点", coord=[x_data[2], y_data2[2]], value=y_data2[2])]
52 | ),
53 | )
54 |
55 | )
56 | line.render('折线图.html')
57 |
--------------------------------------------------------------------------------
/pyecharts使用/003-饼状图.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-饼状图.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-17
6 | """
7 | from pyecharts import options as opts
8 | from pyecharts.charts import Pie
9 | from pyecharts.faker import Faker
10 |
11 |
12 | if __name__ == '__main__':
13 | # 生成假数据
14 | # a, b = Faker.choose(), Faker.values()
15 | # print(a)
16 | # print(b)
17 | # ['可乐', '雪碧', '橙汁', '绿茶', '奶茶', '百威', '青岛']
18 | # [97, 140, 75, 28, 89, 20, 143]
19 | pie = (
20 | Pie()
21 | .set_global_opts(title_opts=opts.TitleOpts(title="Pie-设置颜色"))
22 |
23 | .add("", [list(z) for z in zip(Faker.choose(), Faker.values())])
24 |
25 | .set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"]) # 每个所占面积的颜色设置
26 |
27 | .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}")) # 标签显示的样子
28 | )
29 | pie.render("饼状图.html")
30 |
--------------------------------------------------------------------------------
/pyecharts使用/折线图.html:
--------------------------------------------------------------------------------
(rendered pyecharts output: an HTML page titled "Awesome-pyecharts"; the inline markup and echarts JS are omitted here)
--------------------------------------------------------------------------------
/pyecharts使用/柱状图.html:
--------------------------------------------------------------------------------
(rendered pyecharts output: an HTML page titled "Awesome-pyecharts"; the inline markup and echarts JS are omitted here)
--------------------------------------------------------------------------------
/pyecharts使用/饼状图.html:
--------------------------------------------------------------------------------
(rendered pyecharts output: an HTML page titled "Awesome-pyecharts"; the inline markup and echarts JS are omitted here)
--------------------------------------------------------------------------------
/pymysql的使用/001-创建数据库.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : 001-创建数据库.py
4 | # @Time : 2020/11/24 1:59 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import pymysql
10 |
11 | if __name__ == '__main__':
12 | # 打开数据库连接
13 |     db = pymysql.connect(host='localhost', user='xxxxx', password='xxxxxx')
14 |
15 | # 使用 cursor() 方法创建一个游标对象 cursor
16 | cursor = db.cursor()
17 |
18 | # 创建数据库
19 | db_name = 'TESTDB'
20 | sql = "CREATE DATABASE {}".format(db_name) # 创建数据库
21 | cursor.execute(sql)
22 |
23 | # 使用 execute() 方法执行 SQL 查询
24 | cursor.execute("SELECT VERSION()")
25 |
26 | # 使用fetchone()方法获取单条数据
27 | data = cursor.fetchone()
28 | print("数据库的版本: ", data)
29 | # 关闭数据库连接
30 | db.close()
--------------------------------------------------------------------------------
/pymysql的使用/002-创建表插入数据.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : 002-创建表插入数据.py
4 | # @Time : 2020/11/24 2:20 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import pymysql
10 |
11 | if __name__ == '__main__':
12 | # 打开数据库连接
13 |     db = pymysql.connect(host="localhost", user="xxxxx", password="xxxxxxx", database="TESTDB")
14 |
15 | # 使用 cursor() 方法创建一个游标对象 cursor
16 | cursor = db.cursor()
17 |
18 | # 使用 execute() 方法执行 SQL,如果表存在则删除
19 | cursor.execute("DROP TABLE IF EXISTS EMPLOYEE")
20 |
21 | sql = '''CREATE TABLE EMPLOYEE (
22 | FIRST_NAME CHAR (20) NOT NULL,
23 | LAST_NAME CHAR (20),
24 | AGE INT,
25 | SEX CHAR (1),
26 | INCOME FLOAT
27 | )
28 | '''
29 | cursor.execute(sql)
30 |
31 | # 接着插入数据
32 | insert_sql = """INSERT INTO EMPLOYEE(FIRST_NAME, LAST_NAME, AGE, SEX, INCOME)
33 | VALUES ('Mac', 'Mohan', 20, 'M', 2000)"""
34 | try:
35 | # 执行sql语句
36 | cursor.execute(insert_sql)
37 | # 提交到数据库执行
38 | db.commit()
39 | except:
40 | print('滚犊子,插不进去')
41 | # 如果发生错误则回滚
42 | db.rollback()
43 | # 关闭数据库连接
44 | db.close()
--------------------------------------------------------------------------------
/pymysql的使用/003-查询.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : 003-查询.py
4 | # @Time : 2020/11/24 2:34 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import pymysql
10 |
11 | if __name__ == '__main__':
12 | # 打开数据库连接
13 |     db = pymysql.connect(host="localhost", user="xxxxxx", password="xxxxxxx", database="TESTDB")
14 |
15 | # 使用cursor()方法获取操作游标
16 | cursor = db.cursor()
17 |
18 | # SQL 查询语句
19 | sql = "SELECT * FROM EMPLOYEE WHERE INCOME > %s" % (1000)
20 | try:
21 | cursor.execute(sql)
22 |
23 | # 获取所有记录列表
24 | results = cursor.fetchall()
25 | for row in results:
26 | fname = row[0]
27 | lname = row[1]
28 | age = row[2]
29 | sex = row[3]
30 | income = row[4]
31 | print('fname: {}, lname:{}, age:{}, sex:{}, income:{}'.format(fname, lname, age, sex, income))
32 | except:
33 | print("啥也找不到")
34 |
35 | # 关闭数据库连接
36 | db.close()
--------------------------------------------------------------------------------
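The query above formats the value into the SQL string with `%` before calling `execute`; pymysql can do the substitution itself, escaping the value in the process, which matters once the value comes from user input. The equivalent parameterized form, using the same cursor:

sql = "SELECT * FROM EMPLOYEE WHERE INCOME > %s"
cursor.execute(sql, (1000,))   # pymysql escapes and substitutes the value itself
results = cursor.fetchall()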
/pymysql的使用/004-更新.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : 004-更新.py
4 | # @Time : 2020/11/24 2:41 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import pymysql
10 |
11 | if __name__ == "__main__":
12 | # 打开数据库连接
13 |     db = pymysql.connect(host="localhost", user="xxxxxx", password="xxxxxx", database="TESTDB")
14 |
15 | # 使用cursor()方法获取操作游标
16 | cursor = db.cursor()
17 |
18 | # SQL 更新语句 给男性加1岁
19 | sql = "UPDATE EMPLOYEE SET AGE = AGE + 1 WHERE SEX = '%c'" % ('M')
20 | try:
21 | # 执行SQL语句
22 | cursor.execute(sql)
23 | # 提交到数据库执行
24 | db.commit()
25 | except:
26 | # 发生错误时回滚
27 | db.rollback()
28 |
29 | # 关闭数据库连接
30 | db.close()
--------------------------------------------------------------------------------
/pymysql的使用/005-删除.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : 005-删除.py
4 | # @Time : 2020/11/24 2:43 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import pymysql
10 |
11 | if __name__ == '__main__':
12 | # 打开数据库连接
13 |     db = pymysql.connect(host="localhost", user="xxxxxx", password="xxxxx", database="TESTDB")
14 |
15 | # 使用cursor()方法获取操作游标
16 | cursor = db.cursor()
17 |
18 | # SQL 删除语句
19 | sql = "DELETE FROM EMPLOYEE WHERE AGE > %s" % (20)
20 | try:
21 | # 执行SQL语句
22 | cursor.execute(sql)
23 | # 提交修改
24 | db.commit()
25 | except:
26 | # 发生错误时回滚
27 | db.rollback()
28 |
29 | # 关闭连接
30 | db.close()
--------------------------------------------------------------------------------
/python并发编程/001-多线程.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-多线程.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import threading
8 | import time
9 | import requests
10 |
11 |
12 | def craw(url):
13 | # 这是个爬虫
14 | r = requests.get(url)
15 | print(url, r.status_code)
16 |
17 |
18 | def single_thread():
19 | # 单线程爬虫
20 | print('single_thread start')
21 | for url in urls:
22 | craw(url)
23 | print('single_thread end')
24 |
25 |
26 | def multi_thread():
27 | # 多线程爬虫
28 | print("multi_thread begin")
29 | threads = []
30 | for url in urls:
31 | threads.append(
32 | threading.Thread(target=craw, args=(url,)) # url, 之所以加逗号 是因为这里必须为元组
33 | )
34 |
35 | # 启动多线程
36 | for thread in threads:
37 | thread.start()
38 |
39 | # 等待结束
40 | for thread in threads:
41 | thread.join()
42 | print("multi_thread end")
43 |
44 |
45 | if __name__ == '__main__':
46 | # 爬50页的内容
47 | urls = ['https://www.cnblogs.com/sitehome/p/{}'.format(page) for page in range(1, 50 + 1)]
48 |
49 | # 单线程走起
50 | start = time.time()
51 | single_thread()
52 | end = time.time()
53 | print("single thread cost:", end - start, "seconds")
54 |
55 | # 多线程走起
56 | start = time.time()
57 | multi_thread()
58 | end = time.time()
59 | print("multi thread cost:", end - start, "seconds")
--------------------------------------------------------------------------------
/python并发编程/002-生产者消费者实现多线程爬虫.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-生产者消费者实现多线程爬虫.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import queue
8 | import time
9 | import random
10 | import threading
11 | import requests
12 | from bs4 import BeautifulSoup
13 |
14 |
15 | def craw(url):
16 | # 爬取网页内容
17 | r = requests.get(url)
18 | return r.text
19 |
20 |
21 | def parse(html):
22 | # 解析其中的内容
23 | soup = BeautifulSoup(html, "html.parser")
24 | links = soup.find_all("a", class_="post-item-title")
25 |     return [(link["href"], link.get_text()) for link in links]  # 把链接和标题拿出来
26 |
27 |
28 | def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
29 | '''
30 | 生产者
31 | :param url_queue: url的队列 生产者从中拿出链接 去爬虫
32 | :param html_queue: 生产者将爬取的内容放到这里
33 | :return:
34 | '''
35 | while True:
36 | url = url_queue.get()
37 | html = craw(url)
38 | html_queue.put(html)
39 | print('线程名: ', threading.current_thread().name,
40 | "url_queue.size=", url_queue.qsize()) # 获取url队列中还有多少待爬取的
41 | time.sleep(random.randint(1, 2))
42 |
43 |
44 | def do_parse(html_queue: queue.Queue, fout):
45 | '''
46 | 消费者
47 | :param html_queue: 生产者生产出的内容
48 | :param fout: 消费者将内容解析出来 存到fout中
49 | :return:
50 | '''
51 | while True:
52 | html = html_queue.get()
53 | results = parse(html)
54 | for result in results:
55 | fout.write(str(result) + "\n")
56 | print('线程名: ', threading.current_thread().name,
57 | "html_queue.size=", html_queue.qsize())
58 | time.sleep(random.randint(1, 2))
59 |
60 |
61 | if __name__ == '__main__':
62 | # 待爬取的网页链接
63 | urls = [
64 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1)
65 | ]
66 |
67 | url_queue = queue.Queue()
68 | html_queue = queue.Queue()
69 |
70 | # 将url放进队列中
71 | for url in urls:
72 | url_queue.put(url)
73 |
74 | # 启动三个线程去做生产者
75 | for idx in range(3):
76 | t = threading.Thread(target=do_craw, args=(url_queue, html_queue),
77 | name="craw{}".format(idx))
78 | t.start()
79 |
80 | fout = open("data.txt", "w")
81 | # 启动两个线程去做消费者
82 | for idx in range(2):
83 | t = threading.Thread(target=do_parse, args=(html_queue, fout),
84 | name="parse{}".format(idx))
85 | t.start()
86 |
--------------------------------------------------------------------------------
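As written, this script never terminates: both worker functions loop forever and the threads are non-daemon, so `data.txt` is also never closed cleanly. One common pattern (a sketch that changes the original behavior) is to start the workers as daemon threads and have the main thread wait on the URL queue instead:

# daemon workers are killed automatically when the main thread exits
t = threading.Thread(target=do_craw, args=(url_queue, html_queue),
                     name="craw{}".format(idx), daemon=True)
t.start()

# later, in the main thread; this requires do_craw to call
# url_queue.task_done() after finishing each url
url_queue.join()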
/python并发编程/003-多线程锁机制.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-多线程锁机制.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import threading
8 | import time
9 |
10 | lock = threading.Lock()
11 |
12 |
13 | class Account:
14 | def __init__(self, balance):
15 | self.balance = balance
16 |
17 |
18 | def draw(account, amount):
19 | with lock:
20 | if account.balance >= amount:
21 | # time.sleep(0.1) # 如果不加锁,这里休息0.1秒,每次都会出问题,因为这里会引起线程阻塞,一定会切换
22 | print(threading.current_thread().name, "取钱成功")
23 | account.balance -= amount
24 | print(threading.current_thread().name, "余额", account.balance)
25 | else:
26 | print(threading.current_thread().name,
27 | "取钱失败,余额不足")
28 |
29 |
30 | if __name__ == "__main__":
31 | account = Account(1000) # 金额
32 |
33 |     # 启动两个线程 分别取800块
34 | ta = threading.Thread(name="ta", target=draw, args=(account, 800))
35 | tb = threading.Thread(name="tb", target=draw, args=(account, 800))
36 |
37 | ta.start()
38 | tb.start()
--------------------------------------------------------------------------------
/python并发编程/004-线程池的使用.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 004-线程池的使用.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import concurrent.futures
8 | import requests
9 | from bs4 import BeautifulSoup
10 |
11 |
12 | def craw(url):
13 | # 爬取网页内容
14 | r = requests.get(url)
15 | return r.text
16 |
17 |
18 | def parse(html):
19 | # 解析其中的内容
20 | soup = BeautifulSoup(html, "html.parser")
21 | links = soup.find_all("a", class_="post-item-title")
22 |     return [(link["href"], link.get_text()) for link in links]  # 把链接和标题拿出来
23 |
24 |
25 | if __name__ == '__main__':
26 | # 待爬取的网页链接
27 | urls = [
28 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1)
29 | ]
30 |
31 | # craw
32 | with concurrent.futures.ThreadPoolExecutor() as pool:
33 | htmls = pool.map(craw, urls)
34 | htmls = list(zip(urls, htmls))
35 | for url, html in htmls:
36 | print(url, len(html))
37 | print("craw over")
38 |
39 | # parse
40 | with concurrent.futures.ThreadPoolExecutor() as pool:
41 | futures = {}
42 | for url, html in htmls:
43 | future = pool.submit(parse, html)
44 | futures[future] = url
45 |
46 | # for future, url in futures.items():
47 | # print(url, future.result())
48 |
49 | for future in concurrent.futures.as_completed(futures):
50 | url = futures[future]
51 | print(url, future.result())
52 |
--------------------------------------------------------------------------------
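The two submission styles above differ in ordering: `pool.map` yields results in the order of the input `urls`, while `submit` plus `as_completed` yields futures as they finish, which is why the parse stage keeps a `future -> url` dict to recover which page each result came from. A self-contained illustration:

from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor() as pool:
    in_order = list(pool.map(pow, [2, 2, 2], [1, 2, 3]))        # [2, 4, 8], input order
    futs = {pool.submit(pow, 2, n): n for n in (1, 2, 3)}
    by_completion = [f.result() for f in as_completed(futs)]    # completion order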
/python并发编程/005-线程池加速flask-web服务.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 005-线程池加速flask-web服务.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import flask
8 | import json
9 | import time
10 | from concurrent.futures import ThreadPoolExecutor
11 |
12 | app = flask.Flask(__name__)
13 | pool = ThreadPoolExecutor()
14 |
15 |
16 | def read_file():
17 | time.sleep(0.1)
18 | return "file result"
19 |
20 |
21 | def read_db():
22 | time.sleep(0.2)
23 | return "db result"
24 |
25 |
26 | def read_api():
27 | time.sleep(0.3)
28 | return "api result"
29 |
30 |
31 | @app.route("/")
32 | def index():
33 | result_file = pool.submit(read_file)
34 | result_db = pool.submit(read_db)
35 | result_api = pool.submit(read_api)
36 |
37 | return json.dumps({
38 | "result_file": result_file.result(),
39 | "result_db": result_db.result(),
40 | "result_api": result_api.result(),
41 | })
42 |
43 |
44 | if __name__ == "__main__":
45 | app.run()
--------------------------------------------------------------------------------
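The speedup here comes from `submit` returning immediately: the three simulated IO waits overlap, so the handler responds in roughly max(0.1, 0.2, 0.3) = 0.3s instead of the 0.6s of sequential calls; each `result()` only blocks until its own future completes. A quick check of that claim, reusing the pool and functions above (a sketch, not part of the app):

import time
start = time.time()
futures = [pool.submit(read_file), pool.submit(read_db), pool.submit(read_api)]
values = [f.result() for f in futures]
print(time.time() - start)   # ~0.3s rather than ~0.6s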
/python并发编程/006-多进程的使用.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 006-多进程的使用.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import math
8 | from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
9 | import time
10 |
11 |
12 | def is_prime(n):
13 | if n < 2:
14 | return False
15 | if n == 2:
16 | return True
17 | if n % 2 == 0:
18 | return False
19 | sqrt_n = int(math.floor(math.sqrt(n)))
20 | for i in range(3, sqrt_n + 1, 2):
21 | if n % i == 0:
22 | return False
23 | return True
24 |
25 |
26 | def single_thread():
27 | for number in PRIMES:
28 | is_prime(number)
29 |
30 |
31 | def multi_thread():
32 | with ThreadPoolExecutor() as pool:
33 | pool.map(is_prime, PRIMES)
34 |
35 |
36 | def multi_process():
37 | with ProcessPoolExecutor() as pool:
38 | pool.map(is_prime, PRIMES)
39 |
40 |
41 | if __name__ == "__main__":
42 | PRIMES = [112272535095293] * 100
43 |
44 | start = time.time()
45 | single_thread()
46 | end = time.time()
47 | print("single_thread, cost:", end - start, "seconds")
48 |
49 | start = time.time()
50 | multi_thread()
51 | end = time.time()
52 | print("multi_thread, cost:", end - start, "seconds")
53 |
54 | start = time.time()
55 | multi_process()
56 | end = time.time()
57 | print("multi_process, cost:", end - start, "seconds")
58 |
--------------------------------------------------------------------------------
/python并发编程/007-多进程加速flask-web服务.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 007-多进程加速flask-web服务.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import flask
8 | from concurrent.futures import ProcessPoolExecutor
9 | import math
10 | import json
11 |
12 |
13 | app = flask.Flask(__name__)
14 |
15 |
16 | def is_prime(n):
17 | if n < 2:
18 | return False
19 | if n == 2:
20 | return True
21 | if n % 2 == 0:
22 | return False
23 | sqrt_n = int(math.floor(math.sqrt(n)))
24 | for i in range(3, sqrt_n + 1, 2):
25 | if n % i == 0:
26 | return False
27 | return True
28 |
29 |
30 | @app.route("/is_prime/<numbers>")
31 | def api_is_prime(numbers):
32 | number_list = [int(x) for x in numbers.split(",")]
33 | results = process_pool.map(is_prime, number_list)
34 | return json.dumps(dict(zip(number_list, results)))
35 |
36 |
37 | if __name__ == "__main__":
38 | process_pool = ProcessPoolExecutor()
39 | app.run()
40 |
--------------------------------------------------------------------------------
/python并发编程/008-协程爬虫.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 008-协程爬虫.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import asyncio
8 | import aiohttp
9 | import time
10 |
11 |
12 | async def async_craw(url):
13 | print("craw url: ", url)
14 | async with aiohttp.ClientSession() as session:
15 | async with session.get(url) as resp:
16 | result = await resp.text()
17 | print(f"craw url: {url}, {len(result)}")
18 |
19 |
20 | if __name__ == '__main__':
21 | urls = [
22 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1)
23 | ]
24 |
25 |     loop = asyncio.get_event_loop()  # 获取事件循环
26 | tasks = [loop.create_task(async_craw(url)) for url in urls] # 建立任务
27 | start = time.time()
28 | loop.run_until_complete(asyncio.wait(tasks)) # 开始执行
29 | end = time.time()
30 | print("use time seconds: ", end - start)
--------------------------------------------------------------------------------
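`asyncio.get_event_loop()` outside a running loop is deprecated since Python 3.10. On Python 3.7+ the same crawl can be driven by `asyncio.run` and `asyncio.gather`; a sketch of the modern form, with the aiohttp coroutine above unchanged:

async def main():
    await asyncio.gather(*(async_craw(url) for url in urls))

if __name__ == '__main__':
    urls = ["https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1)]
    start = time.time()
    asyncio.run(main())
    print("use time seconds: ", time.time() - start)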
/python并发编程/009-使用信号量控制协程数进行爬虫.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 009-使用信号量控制协程数进行爬虫.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import asyncio
8 | import aiohttp
9 | import time
10 |
11 |
12 | async def async_craw(url):
13 |     async with semaphore:  # 用信号量限制同时运行的协程数
14 | print("craw url: ", url)
15 | async with aiohttp.ClientSession() as session:
16 | async with session.get(url) as resp:
17 | result = await resp.text()
18 | await asyncio.sleep(5)
19 | print(f"craw url: {url}, {len(result)}")
20 |
21 |
22 | if __name__ == '__main__':
23 | urls = [
24 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1)
25 | ]
26 | semaphore = asyncio.Semaphore(10) # 控制并发量
27 |
28 |     loop = asyncio.get_event_loop()  # 获取事件循环
29 | tasks = [loop.create_task(async_craw(url)) for url in urls] # 建立任务
30 | start = time.time()
31 | loop.run_until_complete(asyncio.wait(tasks)) # 开始执行
32 | end = time.time()
33 | print("use time seconds: ", end - start)
34 |
--------------------------------------------------------------------------------
/streamlit的使用/鸢尾花数据的分类app/app.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : app.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-06-09
6 | """
7 | import streamlit as st
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | from sklearn import datasets
11 | from sklearn.model_selection import train_test_split
12 | from sklearn.decomposition import PCA
13 | from sklearn.svm import SVC
14 | from sklearn.neighbors import KNeighborsClassifier
15 | from sklearn.ensemble import RandomForestClassifier
16 | from sklearn.metrics import accuracy_score
17 |
18 |
19 | def get_dataset(name):
20 | # 加载数据集
21 | if name == 'Iris':
22 | data = datasets.load_iris()
23 | elif name == 'Wine':
24 | data = datasets.load_wine()
25 | else:
26 | data = datasets.load_breast_cancer()
27 | X = data.data
28 | y = data.target
29 | return X, y
30 |
31 |
32 | def add_parameter_ui(clf_name):
33 | # 针对每个分类器 可以调节的超参数
34 | params = dict()
35 | if clf_name == 'SVM':
36 | C = st.sidebar.slider('C', 0.01, 10.0) # 滑动条
37 | params['C'] = C
38 | elif clf_name == 'KNN':
39 | K = st.sidebar.slider('K', 1, 15) # 滑动条
40 | params['K'] = K
41 | else:
42 | max_depth = st.sidebar.slider('max_depth', 2, 15) # 滑动条
43 | params['max_depth'] = max_depth
44 | n_estimators = st.sidebar.slider('n_estimators', 1, 100) # 滑动条
45 | params['n_estimators'] = n_estimators
46 | return params
47 |
48 |
49 | def get_classifier(clf_name, params):
50 | # 实例化分类器
51 | clf = None
52 | if clf_name == 'SVM':
53 | clf = SVC(C=params['C'])
54 | elif clf_name == 'KNN':
55 | clf = KNeighborsClassifier(n_neighbors=params['K'])
56 | else:
57 | clf = RandomForestClassifier(n_estimators=params['n_estimators'],
58 | max_depth=params['max_depth'], random_state=1234)
59 | return clf
60 |
61 |
62 | def plot_result():
63 | pca = PCA(2)
64 | X_projected = pca.fit_transform(X)
65 | x1 = X_projected[:, 0]
66 | x2 = X_projected[:, 1]
67 | fig = plt.figure()
68 | plt.scatter(x1, x2, c=y, alpha=0.8, cmap='viridis')
69 |
70 | plt.xlabel('feature_1')
71 | plt.ylabel('feature_2')
72 | plt.colorbar()
73 | st.pyplot(fig)
74 |
75 |
76 | if __name__ == '__main__':
77 | # 启动该项目,命令行: streamlit run app.py
78 | st.title('鸢尾花数据集的分类')
79 | st.write('''
80 | # 支持选择不同的分类器(SVM/Random Forest/KNN)
81 | 哪一个分类器更好呢?''') # 支持markdown
82 |
83 | # 1. 可以选择不同的数据集 是一个下拉选择框
84 | dataset_name = st.sidebar.selectbox(
85 | '数据集的选择',
86 | ('Iris', 'Breast Cancer', 'Wine')
87 | )
88 |
89 | st.write('## {} 数据集'.format(dataset_name)) # 选择好数据集 这里显示
90 |
91 | # 2. 可以选择不同的分类器, 是一个下拉选择框
92 | classifier_name = st.sidebar.selectbox(
93 | '分类器的选择',
94 | ('KNN', 'SVM', 'Random Forest')
95 | )
96 |
97 | X, y = get_dataset(dataset_name)
98 | st.write('数据集的形状:', X.shape)
99 | st.write('数据集的类别数:', len(np.unique(y)))
100 |
101 | params = add_parameter_ui(classifier_name)
102 |
103 | clf = get_classifier(classifier_name, params)
104 |
105 | # 模型训练
106 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
107 |
108 | clf.fit(X_train, y_train)
109 | y_pred = clf.predict(X_test)
110 |
111 | acc = accuracy_score(y_test, y_pred) # 准确率
112 |
113 | st.write('选择的分类器为: ', classifier_name)
114 | st.write('准确率: ', acc)
115 |
116 | # 画图
117 | plot_result()
118 |
--------------------------------------------------------------------------------
/textrank4zh/001-关键词提取.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/4 16:37
4 | @Auth : xiaolu
5 | @File :001-关键词提取.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from textrank4zh import TextRank4Keyword
10 |
11 |
12 | if __name__ == '__main__':
13 | # 加载文本
14 | data = []
15 | with open('./data/text.txt', 'r', encoding='utf8') as f:
16 | for line in f.readlines():
17 | line = line.strip()
18 | data.append(line)
19 |
20 | # 关键词提取
21 | tr4w = TextRank4Keyword()
22 |
23 | data = data[:1]
24 | for text in data:
25 | tr4w.analyze(text=text, lower=True, window=2)
26 | for item in tr4w.get_keywords(20, word_min_len=1):
27 | print('{}:{:6f}'.format(item.word, item.weight))
28 |
29 | # 关键短语抽取
30 | for text in data:
31 | tr4w.analyze(text=text, lower=True, window=2)
32 | for phrase in tr4w.get_keyphrases(20, min_occur_num=1):
33 | print(phrase)
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/textrank4zh/002-摘要抽取.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/4 16:47
4 | @Auth : xiaolu
5 | @File :002-摘要抽取.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from textrank4zh import TextRank4Sentence
10 |
11 | if __name__ == '__main__':
12 | # 加载文本
13 | data = []
14 | with open('./data/text.txt', 'r', encoding='utf8') as f:
15 | for line in f.readlines():
16 | line = line.strip()
17 | data.append(line)
18 |
19 | # 摘要抽取
20 | tr4s = TextRank4Sentence()
21 |
22 | data = data[:1]
23 | for text in data:
24 | tr4s.analyze(text=text, lower=True, source='all_filters')
25 | for item in tr4s.get_key_sentences(num=3):
26 | print(item.index, item.weight, item.sentence)
27 |
--------------------------------------------------------------------------------
/textrank4zh/readme.txt:
--------------------------------------------------------------------------------
1 | 安装 pip install textrank4zh -i https://pypi.douban.com/simple/
--------------------------------------------------------------------------------