├── Faiss的使用
│   ├── 001-欧式距离检索.py
│   ├── 002-倒排表快速索引.py
│   ├── 003-乘积量化索引.py
│   ├── 004-faiss实现kmeans聚类.py
│   ├── 005-faiss实现pca降维.py
│   ├── 006-faiss实现PQ编码和解码.py
│   ├── 007-faiss实现标量量化器.py
│   └── 008-faiss_use_gpu.py
├── LAC分词器
│   ├── 001-分词.py
│   ├── 002-词性标注和实体识别.py
│   ├── 003-加载自己的词表进行分词.py
│   └── vocab.txt
├── PySpark
│   ├── .DS_Store
│   ├── 001-data_processing_use_pyspark.py
│   ├── 002-linear_regression_use_pyspark.py
│   ├── 003-logistic_regression_use_pyspark.py
│   ├── 004-random_forests_classification_use_pyspark.py
│   ├── 005-kmeans_cluster_use_pyspark.py
│   ├── 006-recommendr_system_use_pyspark.py
│   ├── 007-NLP_use_pyspark.py
│   └── data
│       ├── Linear_regression_dataset.csv
│       ├── Log_Reg_dataset.csv
│       ├── Movie_reviews.csv
│       ├── affairs.csv
│       ├── iris_dataset.csv
│       ├── movie_ratings_df.csv
│       └── sample_data.csv
├── README.md
├── RSA实战
│   ├── 001-rsa生成公私钥并保存.py
│   └── 002-公钥加密私钥解密.py
├── apscheduler实现定时任务
│   └── 定时任务.py
├── chinesebert中的pinyin和glyph的处理
│   ├── MSYH.TTC
│   ├── image_test.py
│   └── pinyin_test.py
├── collections的用法
│   └── 001-collections中的namedtuple用法.py
├── elasticsearch
│   ├── 001-创建库并插入数据.py
│   └── 002-es中的搜索.py
├── flask+echart+ajax
│   ├── .DS_Store
│   ├── app.py
│   ├── static
│   │   ├── .DS_Store
│   │   ├── css
│   │   │   └── main.css
│   │   └── js
│   │       ├── controller.js
│   │       ├── echarts.min.js
│   │       ├── jquery.js
│   │       ├── left.js
│   │       └── right.js
│   └── templates
│       └── index.html
├── flask表单那些事
│   ├── .DS_Store
│   ├── app.py
│   └── templates
│       └── index.html
├── gensim
│   ├── 001-TF-IDF句子相似度计算.py
│   ├── 002-gensim文本摘要.py
│   └── data
│       ├── answer.txt
│       ├── question.txt
│       ├── stopwords.txt
│       ├── test.py
│       └── text.txt
├── gradio学习
│   ├── 01-row_column_layout.py
│   └── 02-chatglm_web.py
├── ipdb调试python程序
│   ├── 001-简单调试.py
│   └── readme.txt
├── logging模块的使用
│   ├── 001-日志级别的使用.py
│   ├── 002-日志控制台输出.py
│   ├── 003-日志文件输出.py
│   └── 004-捕捉异常.py
├── pandas一键画图
│   ├── 001-plot_zhexiantu.html
│   ├── 001-plot_zhexiantu.py
│   ├── 002-plot_sandiantu.html
│   ├── 002-plot_sandiantu.py
│   ├── 003-plot_zhuzhuangtu.html
│   └── 003-plot_zhuzhuangtu.py
├── py2neo操作neo4j
│   ├── .DS_Store
│   ├── py2neo简单练习
│   │   ├── create_graph_v1.py
│   │   ├── create_graph_v2.py
│   │   ├── mingchaonaxieshier.xlsx
│   │   ├── santi.xlsx
│   │   └── test.xlsx
│   ├── readme.txt
│   ├── 事件三元组抽取
│   │   ├── ltp的使用.py
│   │   ├── my_vocab.txt
│   │   └── readme.txt
│   └── 医疗知识图谱问答
│       ├── .DS_Store
│       ├── ahocorasick的使用
│       │   └── demo.py
│       ├── build_medical_graph.py
│       ├── data
│       │   ├── medical.json
│       │   └── medical_min.json
│       ├── data_process
│       │   ├── __init__.py
│       │   ├── __pycache__
│       │   │   ├── __init__.cpython-37.pyc
│       │   │   ├── answer_search.cpython-37.pyc
│       │   │   ├── question_classifier.cpython-37.pyc
│       │   │   └── question_parser.cpython-37.pyc
│       │   ├── answer_search.py
│       │   ├── question_classifier.py
│       │   └── question_parser.py
│       ├── dict
│       │   ├── check.txt
│       │   ├── deny.txt
│       │   ├── department.txt
│       │   ├── disease.txt
│       │   ├── drug.txt
│       │   ├── food.txt
│       │   ├── producer.txt
│       │   └── symptom.txt
│       └── run_chatbot.py
├── pyecharts使用
│   ├── 001-柱状图.py
│   ├── 002-折线图.py
│   ├── 003-饼状图.py
│   ├── 折线图.html
│   ├── 柱状图.html
│   └── 饼状图.html
├── pymysql的使用
│   ├── 001-创建数据库.py
│   ├── 002-创建表插入数据.py
│   ├── 003-查询.py
│   ├── 004-更新.py
│   └── 005-删除.py
├── python并发编程
│   ├── 001-多线程.py
│   ├── 002-生产者消费者实现多线程爬虫.py
│   ├── 003-多线程锁机制.py
│   ├── 004-线程池的使用.py
│   ├── 005-线程池加速flask-web服务.py
│   ├── 006-多进程的使用.py
│   ├── 007-多进程加速flask-web服务.py
│   ├── 008-协程爬虫.py
│   ├── 009-使用信号量控制协程数进行爬虫.py
│   └── data.txt
├── streamlit的使用
│   └── 鸢尾花数据的分类app
│       └── app.py
└── textrank4zh
    ├── 001-关键词提取.py
    ├── 002-摘要抽取.py
    ├── data
    │   └── text.txt
    └── readme.txt
/Faiss的使用/001-欧式距离检索.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-欧式距离检索.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 |
10 |
11 | if __name__ == '__main__':
12 |     n_data, d = 1000, 512  # number of vectors in the corpus, dimension of each vector
13 |     np.random.seed(43)  # fix the random seed so repeated runs give the same results
14 |
15 |     # Build the search corpus
16 |     data = []
17 |     mu, sigma = 3, 0.1  # vectors are drawn from a Gaussian with this mean and standard deviation
18 |     for i in range(n_data):
19 |         data.append(np.random.normal(mu, sigma, d))
20 |     data = np.array(data).astype('float32')  # faiss only supports 32-bit floats
21 |
22 |     # Generate the query vectors
23 |     query = []
24 |     n_query = 10  # generate 10 query vectors
25 |     mu, sigma = 3, 0.1
26 |     np.random.seed(12)
27 |     for i in range(n_query):
28 |         query.append(np.random.normal(mu, sigma, d))
29 |     query = np.array(query).astype('float32')
30 |
31 |     # Build the index; remember to pass the vector dimension d
32 |     index = faiss.IndexFlatL2(d)
33 |     # print(index.is_trained)  # if this is False, the index must be trained first (see the later examples)
34 |
35 |     # Add the data
36 |     index.add(data)
37 |     # print(index.ntotal)  # total number of indexed vectors
38 |
39 |     # Run the search
40 |     k = 10  # return the 10 nearest neighbors
41 |
42 |     # Query with the first five corpus vectors so the result is easy to check: a vector's distance to itself is 0, so its own index should come back first
43 |     query_self = data[:5]
44 |
45 |     dis, ind = index.search(query_self, k=k)
46 |     print(dis)  # each row: distances from one query to its 10 nearest vectors
47 |     print(ind)  # each row: indices of the 10 nearest vectors for one query
48 | """
49 | [[0. 8.55197 8.634906 8.683499 8.698736 8.821949 8.902446
50 | 8.943979 8.9516735 8.972908 ]
51 | [0. 8.369204 8.482748 8.53028 8.581224 8.680499 8.684254
52 | 8.697291 8.719812 8.753435 ]
53 | [0. 8.209936 8.392483 8.456179 8.473589 8.480727 8.551348
54 | 8.553277 8.576391 8.592704 ]
55 | [0. 8.473689 8.621014 8.827385 8.883725 8.980131 8.99064
56 | 9.015673 9.017438 9.027972 ]
57 | [0. 8.268832 8.349455 8.597895 8.611757 8.658188 8.675722
58 | 8.685029 8.70588 8.707612 ]]
59 | [[ 0 877 502 42 606 366 348 923 563 56]
60 | [ 1 849 974 106 348 364 877 242 280 173]
61 | [ 2 877 127 655 253 233 558 678 13 208]
62 | [ 3 421 94 348 502 402 536 646 563 735]
63 | [ 4 986 230 209 446 889 974 241 550 248]]
64 | """
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/Faiss的使用/002-倒排表快速索引.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-倒排表快速索引.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import numpy as np
8 | import faiss
9 |
10 | if __name__ == '__main__':
11 |     n_data, d = 1000, 512  # number of vectors in the corpus, dimension of each vector
12 |     np.random.seed(43)  # fix the random seed so repeated runs give the same results
13 |
14 |     # Build the search corpus
15 |     data = []
16 |     mu, sigma = 3, 0.1  # vectors are drawn from a Gaussian with this mean and standard deviation
17 |     for i in range(n_data):
18 |         data.append(np.random.normal(mu, sigma, d))
19 |     data = np.array(data).astype('float32')  # faiss only supports 32-bit floats
20 |
21 |     # Generate the query vectors
22 |     query = []
23 |     n_query = 10  # generate 10 query vectors
24 |     mu, sigma = 3, 0.1
25 |     np.random.seed(12)
26 |     for i in range(n_query):
27 |         query.append(np.random.normal(mu, sigma, d))
28 |     query = np.array(query).astype('float32')
29 |
30 |     nlist = 50  # number of Voronoi cells to partition the database vectors into
31 |     k = 10
32 |     quantizer = faiss.IndexFlatL2(d)  # the coarse quantizer
33 |     index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)  # METRIC_L2 for L2 distance, or faiss.METRIC_INNER_PRODUCT for inner product
34 |     assert not index.is_trained  # inverted-file (IVF) indexes must be trained before use
35 |     index.train(data)  # the training set should follow the same distribution as the database
36 |     assert index.is_trained
37 |
38 |     index.add(data)
39 |     index.nprobe = 2  # search only the 2 nearest Voronoi cells
40 | dis, ind = index.search(query, k)
41 | print(dis)
42 | print(ind)
43 |
--------------------------------------------------------------------------------
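
A note on the speed/recall trade-off that nprobe controls: with nprobe=2 only 2 of the 50 Voronoi cells are visited, so some true neighbors can be missed. Below is a minimal sketch (appended to the script above, reusing its data, query, index, d and k) that measures recall@k against the exhaustive IndexFlatL2 ground truth for several nprobe values; the exact numbers will vary with the random data:

    import time

    flat = faiss.IndexFlatL2(d)  # exhaustive baseline provides the ground truth
    flat.add(data)
    _, gt = flat.search(query, k)

    for nprobe in (1, 5, 20, 50):
        index.nprobe = nprobe
        t0 = time.time()
        _, ind = index.search(query, k)
        # recall@k: fraction of true top-k neighbors recovered, averaged over queries
        recall = np.mean([len(np.intersect1d(ind[i], gt[i])) / k for i in range(len(gt))])
        print('nprobe=%d  recall@%d=%.3f  time=%.4fs' % (nprobe, k, recall, time.time() - t0))

With nprobe equal to nlist the IVF index degenerates to an exhaustive search, so recall should reach 1.0.
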
/Faiss的使用/003-乘积量化索引.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-乘积量化索引.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import numpy as np
8 | import faiss
9 |
10 | if __name__ == '__main__':
11 |     n_data, d = 1000, 512  # number of vectors in the corpus, dimension of each vector
12 |     np.random.seed(43)  # fix the random seed so repeated runs give the same results
13 |
14 |     # Build the search corpus
15 |     data = []
16 |     mu, sigma = 3, 0.1  # vectors are drawn from a Gaussian with this mean and standard deviation
17 |     for i in range(n_data):
18 |         data.append(np.random.normal(mu, sigma, d))
19 |     data = np.array(data).astype('float32')  # faiss only supports 32-bit floats
20 |
21 |     # Generate the query vectors
22 |     query = []
23 |     n_query = 10  # generate 10 query vectors
24 |     mu, sigma = 3, 0.1
25 |     np.random.seed(12)
26 |     for i in range(n_query):
27 |         query.append(np.random.normal(mu, sigma, d))
28 |     query = np.array(query).astype('float32')
29 |
30 |     nlist = 50
31 |     m = 8  # number of sub-vectors each vector is split into; must divide d evenly
32 |     k = 10
33 |     quantizer = faiss.IndexFlatL2(d)
34 |     index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 4)  # 4 means each sub-vector is encoded with 4 bits
35 |
36 |     index.train(data)
37 |     index.add(data)
38 |     index.nprobe = 50
39 |     dis, ind = index.search(data[:10], k)  # sanity check: query with the corpus itself
40 |     print(dis)
41 |     print(ind)
42 |
43 |     dis, ind = index.search(query, k)  # the real queries
44 | print(dis)
45 | print(ind)
46 |
--------------------------------------------------------------------------------
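
For intuition on why product quantization saves memory: each 512-dim float32 vector occupies 2048 bytes, while the PQ code configured above (m=8 sub-vectors at 4 bits each) occupies only 4 bytes. A quick back-of-the-envelope check using the parameters from the script:

    d, m, nbits = 512, 8, 4
    raw_bytes = d * 4               # one float32 vector: 512 * 4 = 2048 bytes
    code_bytes = m * nbits / 8      # one PQ code: 8 * 4 bits = 4 bytes
    print(raw_bytes / code_bytes)   # 512x compression, ignoring per-list index overhead

The price is lossy compression: distances are computed on the reconstructed codes, which is why the self-query above no longer returns exact zeros.
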
/Faiss的使用/004-faiss实现kmeans聚类.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 004-faiss实现kmeans聚类.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 |
10 |
11 | if __name__ == '__main__':
12 |     # Data
13 |     n_data, d = 2000, 512
14 |     np.random.seed(43)
15 |     data = []
16 |     mu, sigma = 3, 0.1
17 |     for i in range(n_data):
18 |         data.append(np.random.normal(mu, sigma, d))
19 |     data = np.array(data).astype('float32')
20 |
21 |     # Clustering
22 |     n_centroids = 1024  # number of cluster centroids
23 |     d = data.shape[1]
24 |     kmeans = faiss.Kmeans(d, n_centroids)
25 |     kmeans.train(data)
26 |     # Print the cluster centroids
27 |     # print(kmeans.centroids)
28 |     # print(len(kmeans.centroids))
29 |
30 |     # Check which cluster each of the first five vectors belongs to (the two most likely clusters)
31 |     D, I = kmeans.index.search(data[:5], k=2)
32 |     print(D)  # distance to each of the two nearest centroids
33 |     print(I)  # the corresponding cluster ids
34 | """
35 |     Output:
36 | [[4.1553707 5.2924204]
37 | [1.9329664 4.930997 ]
38 | [4.537619 4.8509283]
39 | [4.6700296 5.2252126]
40 | [2.101182 4.9292693]]
41 | [[478 568]
42 | [767 697]
43 | [568 527]
44 | [999 568]
45 | [175 853]]
46 | """
47 |
48 | print('*'*100)
49 |     # Find the vectors closest to each centroid
50 | k = 5
51 | index = faiss.IndexFlatL2(d)
52 | index.add(data)
53 | D, I = index.search(kmeans.centroids, k)
54 | print(D)
55 | print(I)
56 |
57 |
58 |
59 |
--------------------------------------------------------------------------------
/Faiss的使用/005-faiss实现pca降维.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 005-faiss实现pca降维.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 |
10 |
11 | if __name__ == '__main__':
12 |     # Data
13 | n_data, d = 2000, 512
14 | np.random.seed(43)
15 | data = []
16 | mu, sigma = 3, 0.1
17 | for i in range(n_data):
18 | data.append(np.random.normal(mu, sigma, d))
19 | data = np.array(data).astype('float32')
20 |
21 |     mat = faiss.PCAMatrix(512, 64)  # reduce from 512 to 64 dimensions
22 | mat.train(data)
23 | assert mat.is_trained
24 | tr = mat.apply_py(data)
25 | print(tr.shape)
26 | print(tr)
27 |
28 |
--------------------------------------------------------------------------------
/Faiss的使用/006-faiss实现PQ编码和解码.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 006-faiss实现PQ编码和解码.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 |
10 |
11 | if __name__ == '__main__':
12 |     # Data
13 | n_data, d = 2000, 512
14 | np.random.seed(43)
15 | data = []
16 | mu, sigma = 3, 0.1
17 | for i in range(n_data):
18 | data.append(np.random.normal(mu, sigma, d))
19 | data = np.array(data).astype('float32')
20 |
21 |     cs = 4  # number of sub-quantizers; at 8 bits each, the code size is 4 bytes
22 |     # Training data
23 |     x = data  # the original dataset
24 |
25 |     x_train = data  # training set
26 |     pq = faiss.ProductQuantizer(d, cs, 8)
27 |     pq.train(x_train)
28 |
29 |     # encode
30 |     codes = pq.compute_codes(x)
31 |
32 |     # decode
33 |     x2 = pq.decode(codes)
34 |
35 |     # relative squared error between the original data and its encode-decode reconstruction
36 | avg_relative_error = ((x - x2)**2).sum() / (x ** 2).sum()
37 | print(avg_relative_error)
--------------------------------------------------------------------------------
/Faiss的使用/007-faiss实现标量量化器.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 007-faiss实现标量量化器.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 |
10 |
11 | if __name__ == '__main__':
12 |     # Data
13 | n_data, d = 2000, 512
14 | np.random.seed(43)
15 | data = []
16 | mu, sigma = 3, 0.1
17 | for i in range(n_data):
18 | data.append(np.random.normal(mu, sigma, d))
19 | data = np.array(data).astype('float32')
20 |
21 |     x = data
22 |     # Training set
23 |     x_train = data
24 |     # QT_8bit allocates 8 bits per dimension (QT_4bit also works)
25 |     sq = faiss.ScalarQuantizer(d, faiss.ScalarQuantizer.QT_8bit)
26 |     sq.train(x_train)
27 |
28 |     # encode
29 |     codes = sq.compute_codes(x)
30 |
31 |     # decode
32 |     x2 = sq.decode(codes)
33 |
34 |     # relative squared error between the original data and its encode-decode reconstruction
35 | avg_relative_error = ((x - x2)**2).sum() / (x ** 2).sum()
36 | print(avg_relative_error)
--------------------------------------------------------------------------------
/Faiss的使用/008-faiss_use_gpu.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 008-faiss_use_gpu.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-25
6 | """
7 | import faiss
8 | import numpy as np
9 | import time
10 |
11 |
12 |
13 | if __name__ == '__main__':
14 |     d = 512  # vector dimension
15 |     nb = 300000  # size of the vector database
16 |     nq = 100  # number of query vectors
17 |
18 |     np.random.seed(1234)
19 |
20 |     # Randomly generate the vector database
21 |     xb = np.random.random((nb, d)).astype('float32')
22 |     xb[:, 0] += np.arange(nb) / 1000.
23 |
24 |     # Randomly generate 100 query vectors
25 |     xq = np.random.random((nq, d)).astype('float32')
26 |     xq[:, 0] += np.arange(nq) / 1000.
27 |
28 |     quantizer = faiss.IndexFlatL2(d)
29 |     nlist = 100  # number of Voronoi cells to partition the database vectors into
30 |     index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
31 |
32 |     gpu_index = faiss.index_cpu_to_all_gpus(index)  # this single line moves the index onto all available GPUs
33 |     print(gpu_index.is_trained)
34 |     gpu_index.train(xb)
35 |     print(gpu_index.is_trained)
36 |
37 |     gpu_index.add(xb)
38 |     gpu_index.nprobe = 10  # search the 10 nearest Voronoi cells
39 |     k = 10  # return ten results
40 | D, gt_nms = gpu_index.search(xq, k)
41 | print(gt_nms)
--------------------------------------------------------------------------------
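
index_cpu_to_all_gpus shards the index across every visible GPU. To pin the index to a single device instead, faiss also provides StandardGpuResources together with index_cpu_to_gpu. A minimal sketch (reusing the index built above; device 0 is an assumption):

    res = faiss.StandardGpuResources()                  # allocate resources for one GPU
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index)   # move the index to device 0

The rest of the train/add/search flow is identical to the multi-GPU version.
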
/LAC分词器/001-分词.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/3 14:06
4 | @Auth : xiaolu
5 | @File :001-分词.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from LAC import LAC
10 | import jieba
11 |
12 |
13 | if __name__ == '__main__':
14 | lac = LAC(mode='seg')
15 |
16 |     # Single-sample input: a unicode string
17 | text = '大王叫我来巡山'
18 | lac_result = lac.run(text)
19 | print(lac_result)
20 |
21 | jieba_result = jieba.lcut(text)
22 | print(jieba_result)
23 |
24 |     # Batch input: a list of sentences; average throughput is higher this way
25 | texts = ["山里有个庙", "庙里有个老和尚跟一个小和尚"]
26 | result = lac.run(texts)
27 | print(result)
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/LAC分词器/002-词性标注和实体识别.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/3 14:09
4 | @Auth : xiaolu
5 | @File :002-词性标注和实体识别.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from LAC import LAC
10 |
11 |
12 | if __name__ == '__main__':
13 | lac = LAC(mode='lac')
14 | text = '我想涨工资'
15 |
16 | lac_result = lac.run(text)
17 | print(lac_result)
18 |
19 | texts = ["汤青松长得好帅", "我喜欢做安全开发工程师"]
20 | lac_result = lac.run(texts)
21 | print(lac_result)
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/LAC分词器/003-加载自己的词表进行分词.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/6 11:18
4 | @Auth : xiaolu
5 | @File :003-加载自己的词表进行分词.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from LAC import LAC
10 | import jieba
11 |
12 | if __name__ == '__main__':
13 | lac = LAC()
14 | lac.load_customization('./vocab.txt', sep=None)
15 | res1 = lac.run('字节跳动阿里巴巴腾讯公司金山软件小米科技')
16 | res2 = jieba.lcut('字节跳动阿里巴巴腾讯公司金山软件小米科技')
17 | print(res1)
18 | print(res2)
19 |
20 |
21 |
--------------------------------------------------------------------------------
/LAC分词器/vocab.txt:
--------------------------------------------------------------------------------
1 | 我
2 | 爱你
3 | 我爱
--------------------------------------------------------------------------------
/PySpark/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/PySpark/.DS_Store
--------------------------------------------------------------------------------
/PySpark/001-data_processing_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-data_processing_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 | from pyspark.sql import SparkSession
10 | from pyspark.sql.types import StringType, DoubleType, IntegerType
11 | from pyspark.sql.functions import udf
12 | from pyspark.sql.functions import pandas_udf, PandasUDFType
13 |
14 |
15 | def price_range(brand):
16 | if brand in ['Samsung', 'Apple']:
17 | return 'High Price'
18 | elif brand == 'MI':
19 | return 'Mid Price'
20 | else:
21 | return 'Low Price'
22 |
23 |
24 | def remaining_yrs(age):
25 | yrs_left = 100-age
26 | return yrs_left
27 |
28 |
29 | if __name__ == '__main__':
30 |     # 1. Create the Spark session
31 |     spark = SparkSession.builder.appName('data_processing').getOrCreate()
32 |
33 |     # 2. Load the data
34 |     df = spark.read.csv('./data/sample_data.csv', inferSchema=True, header=True)
35 |     print(df.columns)  # print all column names: ['ratings', 'age', 'experience', 'family', 'mobile']
36 |     print(df.count())  # total number of rows: 33
37 |
38 |     # Print the schema
39 |     print(df.printSchema())
40 |
41 |     # Show the first five rows
42 |     print(df.show(n=5))
43 |
44 |     # Show the first three rows of two selected columns
45 |     print(df.select('ratings', 'mobile').show(n=3))
46 |
47 |     # Show summary statistics, i.e. the mean, standard deviation, etc. of every column
48 |     print(df.describe().show())
49 |
50 |     # Create a new column
51 |     print(df.withColumn("age_after_10_yrs", (df["age"]+10)).show(5))
52 |
53 |     # Cast a column to another type and store the result as a new column
54 |     print(df.withColumn('age_double', df['age'].cast(DoubleType())).show(3, False))
55 |
56 |     # Filter: keep only the rows where a column takes a given value
57 |     print(df.filter(df['mobile'] == 'Vivo').select('age', 'ratings', 'mobile').show())
58 |
59 |     # Filter on multiple conditions
60 |     print(df.filter((df['mobile'] == 'Vivo') & (df['experience'] > 10)).show())
61 |
62 |     # Show the distinct values of a column
63 |     print(df.select('mobile').distinct().show())
64 |     print('number of distinct values:', df.select('mobile').distinct().count())
65 |
66 |     # Group rows by the values of a column
67 |     print(df.groupBy('mobile').count().show())  # count per group
68 |     print(df.groupBy('mobile').mean().show())  # per-group mean of every column
69 |     print(df.groupBy('mobile').sum().show())  # per-group sum of every column
70 |     print(df.groupBy('mobile').agg({'experience': 'sum'}).show())  # per-group sum of the experience column only
71 |     print(df.groupBy('mobile').max().show())  # per-group maximum of every column
72 |     print(df.groupBy('mobile').min().show())  # per-group minimum of every column
73 |
74 |     # Plain UDF
75 |     # User-defined functions (UDFs)
76 |     brand_udf = udf(price_range, StringType())  # two arguments: the user-defined function and its return type
77 |     print(df.withColumn('price_range', brand_udf(df['mobile'])).show())  # apply the udf to the mobile column
78 |
79 |     # Or use a lambda expression
80 |     age_udf = udf(lambda age: "young" if age <= 30 else "senior", StringType())
81 |     print(df.withColumn("age_group", age_udf(df.age)).show())
82 |
83 |     # Drop duplicate rows
84 |     print(df.count())
85 |     df = df.dropDuplicates()
86 |     print('row count after dropping duplicates:', df.count())
87 |
88 |     # Drop a column
89 |     df_new = df.drop('mobile')
90 |     print(df_new.show(5))
--------------------------------------------------------------------------------
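
The script imports pandas_udf but never uses it. For column-wise transformations, a vectorized pandas UDF is usually much faster than the row-at-a-time udf shown above, because it operates on whole pandas Series batches (this requires pyarrow to be installed). A sketch in the Spark >= 3.0 type-hint style, reimplementing the remaining_yrs helper from the script; remaining_yrs_pd is a hypothetical name, and this is untested against this exact setup:

    import pandas as pd
    from pyspark.sql.functions import pandas_udf
    from pyspark.sql.types import IntegerType

    @pandas_udf(IntegerType())
    def remaining_yrs_pd(age: pd.Series) -> pd.Series:
        return 100 - age  # operates on a whole batch of rows at once

    df.withColumn('yrs_left', remaining_yrs_pd(df['age'])).show(5)
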
/PySpark/002-linear_regression_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-linear_regression_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 |
10 | from pyspark.sql import SparkSession
11 | from pyspark.sql.functions import corr
12 | from pyspark.ml.linalg import Vector
13 | from pyspark.ml.feature import VectorAssembler
14 | from pyspark.ml.regression import LinearRegression
15 |
16 |
17 | def analyse_data(df):
18 |     '''
19 |     Data analysis
20 |     :param df:
21 |     :return:
22 |     '''
23 |     # Print the schema
24 |     print(df.printSchema())
25 |
26 |     # Show the first ten rows
27 |     print(df.head(10))
28 |
29 |     # Correlation between one feature and the target: var_1 vs output
30 |     print(df.select(corr('var_1', 'output')).show())  # 0.9187399607627283
31 |
32 |
33 | def feature_process(df):
34 |     '''
35 |     Feature engineering
36 |     :param df:
37 |     :return:
38 |     '''
39 |     # Assemble var_1 through var_5 into a single vector column named features
40 |     vec_assembler = VectorAssembler(inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'], outputCol='features')
41 |     features_df = vec_assembler.transform(df)
42 |     # print(features_df.select('features').show(5))  # inspect the features column
43 |
44 |     model_df = features_df.select('features', 'output')  # keep features and the target for model training
45 |     # print(model_df.show(5))
46 |     return model_df
47 |
48 |
49 | if __name__ == '__main__':
50 |     # 1. Load the dataset
51 |     spark = SparkSession.builder.appName('lin_reg').getOrCreate()
52 |     df = spark.read.csv('./data/Linear_regression_dataset.csv', inferSchema=True, header=True)
53 |     # print('rows: {}, columns: {}'.format(df.count(), len(df.columns)))  # rows: 1232, columns: 6
54 |
55 |     # 2. Data analysis
56 |     # analyse_data(df)  # uncomment to explore the data
57 |
58 |     # 3. Feature engineering
59 |     model_df = feature_process(df)  # merge the feature values into one vector
60 |     # Split the data
61 |     train_df, test_df = model_df.randomSplit([0.7, 0.3])
62 |     # print('train --- rows: {}, columns: {}'.format(train_df.count(), len(train_df.columns)))  # rows: 868, columns: 2
63 |     # print('test --- rows: {}, columns: {}'.format(test_df.count(), len(test_df.columns)))  # rows: 364, columns: 2
64 |
65 |     # 4. Model training
66 |     lin_Reg = LinearRegression(labelCol='output')
67 |     lr_model = lin_Reg.fit(train_df)
68 |
69 |     # 5. Model evaluation
70 |     # Training is done; print the regression coefficients
71 |     print(lr_model.coefficients)
72 |
73 |     training_predictions = lr_model.evaluate(train_df)
74 |     print('train MSE:', training_predictions.meanSquaredError)
75 |     # train MSE: 0.00014265219879599827
76 |
77 |     testing_predictions = lr_model.evaluate(test_df)
78 |     print('test MSE:', testing_predictions.meanSquaredError)
79 |     # test MSE: 0.00014983739298532136
--------------------------------------------------------------------------------
/PySpark/003-logistic_regression_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-logistic_regression_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 |
10 | from pyspark.sql import SparkSession
11 | from pyspark.ml.feature import StringIndexer
12 | from pyspark.ml.feature import VectorAssembler
13 | from pyspark.ml.feature import OneHotEncoder
14 | from pyspark.ml.classification import LogisticRegression
15 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
16 |
17 |
18 | def analyse_data(df):
19 |     '''
20 |     Data analysis
21 |     :param df:
22 |     :return:
23 |     '''
24 |     # Print the schema
25 |     print(df.printSchema())
26 |
27 |     # Show the first five rows
28 |     print(df.show(n=5))
29 |
30 |     # A quick look at the summary statistics of each feature
31 |     print(df.describe().show())  # mean and stddev are not computed for categorical columns
32 |
33 |     # Group by country to see which country has the most samples
34 |     print(df.groupby('Country').count().show())
35 |
36 |     # See which platform (search engine) has the most users
37 |     print(df.groupby('Platform').count().show())
38 |
39 |
40 | def feature_process(df):
41 |     '''
42 |     Feature engineering
43 |     :param df:
44 |     :return:
45 |     '''
46 |     # The country and platform features must be converted to numeric features
47 |     search_engine_indexer = StringIndexer(inputCol="Platform", outputCol='Platform_Num').fit(df)
48 |     df = search_engine_indexer.transform(df)
49 |     # print(df.show(3))
50 |     search_engine_encoder = OneHotEncoder(inputCol='Platform_Num', outputCol='Platform_Num_Vec').fit(df)
51 |     df = search_engine_encoder.transform(df)
52 |     # print(df.show(3))
53 |
54 |     # print('*'*150)
55 |     # Then handle the country feature
56 |     country_indexer = StringIndexer(inputCol="Country", outputCol='Country_Num').fit(df)
57 |     df = country_indexer.transform(df)
58 |     # print(df.show(3))
59 |     country_encoder = OneHotEncoder(inputCol='Country_Num', outputCol='Country_Num_Vec').fit(df)
60 |     df = country_encoder.transform(df)
61 |     # print(df.show(3))
62 |
63 |     df_assembler = VectorAssembler(
64 |         inputCols=['Platform_Num_Vec', 'Country_Num_Vec', 'Age', 'Repeat_Visitor', 'Web_pages_viewed'],
65 |         outputCol='features'
66 |     )
67 |     df = df_assembler.transform(df)
68 |     model_df = df.select(['features', 'Status'])
69 |     return model_df
70 |
71 |
72 | if __name__ == "__main__":
73 |     # 1. Load the data
74 |     spark = SparkSession.builder.appName('log_reg').getOrCreate()
75 |     df = spark.read.csv('./data/Log_Reg_dataset.csv', inferSchema=True, header=True)
76 |     # print('rows: {}, columns: {}'.format(df.count(), len(df.columns)))  # rows: 20000, columns: 6
77 |
78 |     # 2. Data analysis
79 |     # analyse_data(df)
80 |
81 |     # 3. Feature engineering
82 |     model_df = feature_process(df)
83 |     # print(model_df.show(3))
84 |     # Split the dataset
85 |     training_df, test_df = model_df.randomSplit([0.75, 0.25])
86 |     print('training set size:', training_df.count())
87 |     print('test set size:', test_df.count())
88 |
89 |     print('label distribution in the training set:')
90 |     print(training_df.groupBy('Status').count().show())
91 |
92 |     print('label distribution in the test set:')
93 |     print(test_df.groupBy('Status').count().show())
94 |
95 |     # 4. Train the model
96 |     log_reg = LogisticRegression(labelCol='Status').fit(training_df)
97 |
98 |     # 5. Evaluate the model
99 |     train_results = log_reg.evaluate(training_df).predictions
100 |     correct_preds = train_results.filter(train_results['Status'] == 1).filter(train_results['prediction'] == 1).count()
101 |     print('recall on the training set:', float(correct_preds)/(training_df.filter(training_df['Status'] == 1).count()))
102 |
103 |     # Performance on the test set
104 |     results = log_reg.evaluate(test_df).predictions
105 |     # Build the confusion matrix
106 |     true_positives = results[(results.Status == 1) & (results.prediction == 1)].count()
107 |     true_negatives = results[(results.Status == 0) & (results.prediction == 0)].count()
108 |     false_positives = results[(results.Status == 0) & (results.prediction == 1)].count()
109 |     false_negatives = results[(results.Status == 1) & (results.prediction == 0)].count()
110 |     recall = float(true_positives)/(true_positives + false_negatives)
111 |     print('recall:', recall)
112 |
113 |     precision = float(true_positives) / (true_positives + false_positives)
114 |     print('precision:', precision)
115 |
116 |     accuracy = float((true_positives + true_negatives) / results.count())
117 |     print('accuracy:', accuracy)
--------------------------------------------------------------------------------
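
BinaryClassificationEvaluator is imported in this script but never used; the metrics above are computed by hand from the confusion matrix. A short sketch of getting the test AUC with the evaluator instead, reusing the results DataFrame from the script (the evaluator reads the rawPrediction column that LogisticRegression adds by default):

    evaluator = BinaryClassificationEvaluator(labelCol='Status')  # metric defaults to areaUnderROC
    auc = evaluator.evaluate(results)
    print('test AUC:', auc)
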
/PySpark/004-random_forests_classification_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 004-random_forests_classification_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 |
9 | findspark.init()
10 |
11 | from pyspark.ml.feature import VectorAssembler
12 | from pyspark.sql import SparkSession
13 | from pyspark.ml.classification import RandomForestClassifier
14 | from pyspark.ml.classification import RandomForestClassificationModel
15 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
16 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator
17 |
18 |
19 | def analyse_data(df):
20 |     '''
21 |     Data analysis
22 |     :param df:
23 |     :return:
24 |     '''
25 |     print(df.show(5))
26 |
27 |     # Summary statistics (mean, stddev, ...) of each feature
28 |     print(df.describe().select('summary', 'rate_marriage', 'age', 'yrs_married', 'children', 'religious').show())
29 |
30 |     # Distribution of marriage ratings
31 |     print(df.groupBy('rate_marriage').count().show())
32 |
33 |     # Group by children and affairs: people with no children and no affairs form the largest group
34 |     print(df.groupBy('children', 'affairs').count().orderBy('children', 'affairs', 'count', ascending=True).show())
35 |
36 |
37 | def feature_process(df):
38 |     '''
39 |     Feature engineering
40 |     :param df:
41 |     :return:
42 |     '''
43 |     df_assembler = VectorAssembler(inputCols=['rate_marriage', 'age', 'yrs_married', 'children', 'religious'],
44 |                                    outputCol="features")
45 |     df = df_assembler.transform(df)
46 |     model_df = df.select(['features', 'affairs'])
47 |     return model_df
48 |
49 |
50 | if __name__ == '__main__':
51 |     # 1. Load the dataset
52 |     spark = SparkSession.builder.appName('random_forest').getOrCreate()
53 |     df = spark.read.csv('./data/affairs.csv', inferSchema=True, header=True)
54 |     print((df.count(), len(df.columns)))
55 |
56 |     # 2. Data analysis
57 |     analyse_data(df)
58 |
59 |     # 3. Feature engineering
60 |     model_df = feature_process(df)
61 |     # Split the dataset
62 |     train_df, test_df = model_df.randomSplit([0.75, 0.25])
63 |     print('training set size:', train_df.count())
64 |     print('training label counts:')
65 |     print(train_df.groupBy('affairs').count().show())
66 |
67 |     print('test set size:', test_df.count())
68 |     print('test label counts:')
69 |     print(test_df.groupBy('affairs').count().show())
70 |
71 |     # 4. Train the model
72 |     rf_classifier = RandomForestClassifier(labelCol='affairs', numTrees=50).fit(train_df)
73 |
74 |     # 5. Evaluate the model
75 |     rf_predictions = rf_classifier.transform(test_df)
76 |
77 |     rf_accuracy = MulticlassClassificationEvaluator(labelCol='affairs', metricName='accuracy').evaluate(rf_predictions)
78 |     print('test accuracy:', rf_accuracy)
79 |
80 |     rf_precision = MulticlassClassificationEvaluator(labelCol='affairs', metricName='weightedPrecision').evaluate(
81 |         rf_predictions)
82 |     print('test weighted precision:', rf_precision)
83 |
84 |     rf_auc = BinaryClassificationEvaluator(labelCol='affairs').evaluate(rf_predictions)
85 |     print('test AUC:', rf_auc)
86 |
87 |     # Feature importances learned by the classifier
88 |     print(rf_classifier.featureImportances)
89 |
90 |     # Save the model
91 |     rf_classifier.save("./RF_model")
92 |
93 |     # To reuse it later, load it like this
94 |     rf = RandomForestClassificationModel.load("./RF_model")
95 |     model_predictions = rf.transform(test_df)
96 |     model_predictions.show()
97 |
98 |
--------------------------------------------------------------------------------
/PySpark/005-kmeans_cluster_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 005-kmeans_cluster_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 | import pyspark
10 | import pandas as pd
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 | from pyspark.sql.functions import *
14 | from pyspark.sql.types import *
15 | from pyspark.sql.functions import rand, randn
16 | from pyspark.ml.clustering import KMeans
17 | from pyspark.sql import SparkSession
18 | from pyspark.ml.linalg import Vectors
19 | from pyspark.ml.feature import VectorAssembler
20 | from pyspark.ml.evaluation import ClusteringEvaluator
21 |
22 | def analyse_data(df):
23 |     '''
24 |     Data analysis
25 |     :param df:
26 |     :return:
27 |     '''
28 |     print('number of distinct labels:', df.select('species').distinct().count())
29 |
30 |     # Sample count per class
31 |     print(df.groupBy('species').count().orderBy('count', ascending=False).show())
32 |
33 |
34 | def feature_process(df):
35 |     '''
36 |     Feature engineering
37 |     :param df:
38 |     :return:
39 |     '''
40 |     input_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
41 |     vec_assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
42 |     final_data = vec_assembler.transform(df)
43 |     return final_data
44 |
45 |
46 | if __name__ == '__main__':
47 |     # Load the iris dataset
48 |     spark = SparkSession.builder.appName('k_means').getOrCreate()
49 |     df = spark.read.csv('./data/iris_dataset.csv', inferSchema=True, header=True)
50 |     print((df.count(), len(df.columns)))
51 |
52 |     analyse_data(df)
53 |
54 |     final_data = feature_process(df)
55 |
56 |     errors = []
57 |
58 |     for k in range(2, 10):
59 |         kmeans = KMeans(featuresCol='features', k=k)
60 |         model = kmeans.fit(final_data)
61 |
62 |         # Make predictions
63 |         predictions = model.transform(final_data)
64 |         evaluator = ClusteringEvaluator()
65 |         silhouette = evaluator.evaluate(predictions)  # silhouette score (squared Euclidean distance by default)
66 |         errors.append(silhouette)  # collect the score for each k (the list was otherwise unused)
67 |         # Print the cluster centers
68 |         centers = model.clusterCenters()
69 |         print("Cluster Centers: ")
70 |         for center in centers:
71 |             print(center)
72 |
73 |     print('silhouette score per k:', errors)
74 |
75 |
76 |
--------------------------------------------------------------------------------
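
The loop above now collects one silhouette score per k in errors, but matplotlib (already imported in the script) is never used. A sketch of the usual plot for picking k from those scores, appended after the loop:

    plt.plot(range(2, 10), errors, marker='o')
    plt.xlabel('k')
    plt.ylabel('silhouette score')
    plt.savefig('silhouette_vs_k.png')  # higher is better; look for the peak
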
/PySpark/006-recommendr_system_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 006-recommendr_system_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-09
6 | """
7 | import findspark
8 |
9 | findspark.init()
10 |
11 | from pyspark.sql import SparkSession
12 | from pyspark.sql.functions import rand
13 | from pyspark.ml.feature import StringIndexer, IndexToString
14 | from pyspark.ml.recommendation import ALS
15 | from pyspark.ml.evaluation import RegressionEvaluator
16 |
17 |
18 | def analyse_data(df):
19 |     '''
20 |     Data analysis
21 |     :param df:
22 |     :return:
23 |     '''
24 |     print(df.printSchema())  # inspect the schema
25 |
26 |     # Show the first 5 rows
27 |     print(df.show(5))
28 |
29 |     print(df.orderBy(rand()).show(5))  # shuffle the data and show the first five rows
30 |
31 |     # Group by user to see how many movies each user rated: the five most active users
32 |     print(df.groupBy('userId').count().orderBy('count', ascending=False).show(5))
33 |
34 |     # Show the five most-rated movies
35 |     print(df.groupBy('title').count().orderBy('count', ascending=False).show(5))
36 |
37 |
38 | def feature_process(df):
39 |     '''
40 |     Feature engineering
41 |     :param df:
42 |     :return:
43 |     '''
44 |     # 1. Convert title to a numeric index, i.e. add one more column
45 |     stringIndexer = StringIndexer(inputCol="title", outputCol="title_new")
46 |     model = stringIndexer.fit(df)
47 |     indexed = model.transform(df)
48 |     print(indexed.show(5))
49 |     return indexed
50 |
51 |
52 | if __name__ == '__main__':
53 |     # 1. Load the data
54 |     spark = SparkSession.builder.appName('rc').getOrCreate()
55 |     df = spark.read.csv('./data/movie_ratings_df.csv', inferSchema=True, header=True)
56 |     # print((df.count(), len(df.columns)))  # (100000, 3)
57 |
58 |     # 2. Data analysis
59 |     analyse_data(df)
60 |
61 |     # 3. Feature engineering
62 |     model_df = feature_process(df)
63 |     # Split the dataset
64 |     train, test = model_df.randomSplit([0.75, 0.25])
65 |     print('training set size:', train.count())
66 |     print('test set size:', test.count())
67 |     # training set size: 74996
68 |     # test set size: 25004
69 |
70 |     # 4. Model training
71 |     rec = ALS(maxIter=10, regParam=0.01, userCol='userId',
72 |               itemCol='title_new', ratingCol='rating',
73 |               nonnegative=True, coldStartStrategy="drop")
74 |     rec_model = rec.fit(train)
75 |
76 |     # 5. Model evaluation
77 |     predicted_ratings = rec_model.transform(test)
78 |     print(predicted_ratings.printSchema())
79 |
80 |     # Compute the RMSE between the predictions and the true ratings
81 |     evaluator = RegressionEvaluator(metricName='rmse', predictionCol='prediction', labelCol='rating')
82 |     rmse = evaluator.evaluate(predicted_ratings)
83 |     print(rmse)
84 |
85 |
--------------------------------------------------------------------------------
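
Once the ALS model is fitted, Spark can also generate top-N recommendations directly. A sketch using recommendForAllUsers on the rec_model from the script; note the results contain title_new indices, and mapping them back to titles would need the fitted StringIndexer model, which feature_process above does not return (IndexToString is imported for exactly that purpose but left unused):

    user_recs = rec_model.recommendForAllUsers(5)  # top-5 (title_new, predicted rating) pairs per user
    user_recs.show(5, truncate=False)
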
/PySpark/007-NLP_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 007-NLP_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-09
6 | """
7 | import findspark
8 |
9 | findspark.init()
10 |
11 | from pyspark.sql import SparkSession
12 | from pyspark.ml.feature import Tokenizer
13 | from pyspark.ml.feature import StopWordsRemover
14 | from pyspark.ml.feature import CountVectorizer
15 | from pyspark.ml.feature import HashingTF, IDF
16 | from pyspark.sql.functions import length
17 | from pyspark.sql.functions import udf
18 | from pyspark.sql.types import IntegerType
19 | from pyspark.sql.functions import *
20 | from pyspark.ml.feature import VectorAssembler
21 | from pyspark.ml.classification import LogisticRegression
22 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
23 |
24 |
25 | def basic_op():
26 |     '''
27 |     Basic operations
28 |     :return:
29 |     '''
30 |     spark = SparkSession.builder.appName('nlp').getOrCreate()
31 |     df = spark.createDataFrame([(1, 'I really liked this movie'),
32 |                                 (2, 'I would recommend this movie to my friends'),
33 |                                 (3, 'movie was alright but acting was horrible'),
34 |                                 (4, 'I am never watching that movie ever again')],
35 |                                ['user_id', 'review'])
36 |     # print(df.show())
37 |
38 |     # 1. Tokenize the text into a new column
39 |     tokenization = Tokenizer(inputCol='review', outputCol='tokens')
40 |     tokenized_df = tokenization.transform(df)
41 |     # print(tokenized_df.show())
42 |
43 |     # 2. Remove stopwords
44 |     stopword_removal = StopWordsRemover(inputCol='tokens', outputCol='refined_tokens')
45 |     refined_df = stopword_removal.transform(tokenized_df)
46 |     print(refined_df.select(['user_id', 'tokens', 'refined_tokens']).show(10))
47 |
48 |     # 3. Count vectors (bag-of-words)
49 |     count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features')
50 |     cv_df = count_vec.fit(refined_df).transform(refined_df)
51 |     print(cv_df.select(['user_id', 'refined_tokens', 'features']).show(4))
52 |     print('vocabulary (after stopword removal):', count_vec.fit(refined_df).vocabulary)
53 |
54 |     # 4. Compute tf-idf
55 |     hashing_vec = HashingTF(inputCol='refined_tokens', outputCol='tf_features')
56 |     hashing_df = hashing_vec.transform(refined_df)  # first hash the tokens into term-frequency buckets
57 |     print(hashing_df.select(['user_id', 'refined_tokens', 'tf_features']).show())
58 |
59 |     tf_idf_vec = IDF(inputCol='tf_features', outputCol='tf_idf_features')
60 |     tf_idf_df = tf_idf_vec.fit(hashing_df).transform(hashing_df)
61 |     print(tf_idf_df.select(['user_id', 'tf_idf_features']).show(4))
62 |
63 |
64 | def data_process(text_df):
65 |     text_df = text_df.filter(((text_df.Sentiment == '1') | (text_df.Sentiment == '0')))
66 |     print('rows after cleaning:', text_df.count())
67 |
68 |     print('class distribution:')
69 |     print(text_df.groupBy('Sentiment').count().show())
70 |
71 |     # Cast the sentiment to a float Label column
72 |     text_df = text_df.withColumn("Label", text_df.Sentiment.cast('float')).drop('Sentiment')
73 |
74 |     # Add a review-length feature, then tokenize
75 |     text_df = text_df.withColumn('length', length(text_df['Review']))
76 |     tokenization = Tokenizer(inputCol='Review', outputCol='tokens')
77 |     tokenized_df = tokenization.transform(text_df)
78 |
79 |     # Remove stopwords
80 |     stopword_removal = StopWordsRemover(inputCol='tokens', outputCol='refined_tokens')
81 |     refined_text_df = stopword_removal.transform(tokenized_df)
82 |
83 |     len_udf = udf(lambda s: len(s), IntegerType())
84 |     refined_text_df = refined_text_df.withColumn("token_count", len_udf(col('refined_tokens')))
85 |
86 |     count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features')
87 |     cv_text_df = count_vec.fit(refined_text_df).transform(refined_text_df)
88 |
89 |     model_text_df = cv_text_df.select(['features', 'token_count', 'Label'])
90 |     return model_text_df
91 |
92 |
93 | if __name__ == '__main__':
94 |     # basic_op()
95 |
96 |     # A simple text-classification example
97 |     spark = SparkSession.builder.appName('text_classification').getOrCreate()
98 |     text_df = spark.read.csv('./data/Movie_reviews.csv', inferSchema=True, header=True, sep=',')
99 |     print('row count:', text_df.count())  # row count: 7087
100 |
101 |     model_text_df = data_process(text_df)
102 |     df_assembler = VectorAssembler(inputCols=['features', 'token_count'], outputCol='features_vec')
103 |     model_text_df = df_assembler.transform(model_text_df)
104 |
105 |     # Split the dataset
106 |     training_df, test_df = model_text_df.randomSplit([0.75, 0.25])
107 |
108 |     # Train the model
109 |     log_reg = LogisticRegression(featuresCol='features_vec', labelCol='Label').fit(training_df)
110 |
111 |     # Evaluate the model
112 |     results = log_reg.evaluate(test_df).predictions
113 |
114 |     # confusion matrix
115 |     true_positives = results[(results.Label == 1) & (results.prediction == 1)].count()
116 |     true_negatives = results[(results.Label == 0) & (results.prediction == 0)].count()
117 |     false_positives = results[(results.Label == 0) & (results.prediction == 1)].count()
118 |     false_negatives = results[(results.Label == 1) & (results.prediction == 0)].count()
119 |
120 |     recall = float(true_positives) / (true_positives + false_negatives)
121 |     print(recall)
122 |
123 |     precision = float(true_positives) / (true_positives + false_positives)
124 |     print(precision)
125 |
126 |     accuracy = float((true_positives + true_negatives) / results.count())
127 |     print(accuracy)
128 |
--------------------------------------------------------------------------------
/PySpark/data/Movie_reviews.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/PySpark/data/Movie_reviews.csv
--------------------------------------------------------------------------------
/PySpark/data/iris_dataset.csv:
--------------------------------------------------------------------------------
1 | sepal_length,sepal_width,petal_length,petal_width,species
2 | 5.1,3.5,1.4,0.2,setosa
3 | 4.9,3,1.4,0.2,setosa
4 | 4.7,3.2,1.3,0.2,setosa
5 | 4.6,3.1,1.5,0.2,setosa
6 | 5,3.6,1.4,0.2,setosa
7 | 5.4,3.9,1.7,0.4,setosa
8 | 4.6,3.4,1.4,0.3,setosa
9 | 5,3.4,1.5,0.2,setosa
10 | 4.4,2.9,1.4,0.2,setosa
11 | 4.9,3.1,1.5,0.1,setosa
12 | 5.4,3.7,1.5,0.2,setosa
13 | 4.8,3.4,1.6,0.2,setosa
14 | 4.8,3,1.4,0.1,setosa
15 | 4.3,3,1.1,0.1,setosa
16 | 5.8,4,1.2,0.2,setosa
17 | 5.7,4.4,1.5,0.4,setosa
18 | 5.4,3.9,1.3,0.4,setosa
19 | 5.1,3.5,1.4,0.3,setosa
20 | 5.7,3.8,1.7,0.3,setosa
21 | 5.1,3.8,1.5,0.3,setosa
22 | 5.4,3.4,1.7,0.2,setosa
23 | 5.1,3.7,1.5,0.4,setosa
24 | 4.6,3.6,1,0.2,setosa
25 | 5.1,3.3,1.7,0.5,setosa
26 | 4.8,3.4,1.9,0.2,setosa
27 | 5,3,1.6,0.2,setosa
28 | 5,3.4,1.6,0.4,setosa
29 | 5.2,3.5,1.5,0.2,setosa
30 | 5.2,3.4,1.4,0.2,setosa
31 | 4.7,3.2,1.6,0.2,setosa
32 | 4.8,3.1,1.6,0.2,setosa
33 | 5.4,3.4,1.5,0.4,setosa
34 | 5.2,4.1,1.5,0.1,setosa
35 | 5.5,4.2,1.4,0.2,setosa
36 | 4.9,3.1,1.5,0.1,setosa
37 | 5,3.2,1.2,0.2,setosa
38 | 5.5,3.5,1.3,0.2,setosa
39 | 4.9,3.1,1.5,0.1,setosa
40 | 4.4,3,1.3,0.2,setosa
41 | 5.1,3.4,1.5,0.2,setosa
42 | 5,3.5,1.3,0.3,setosa
43 | 4.5,2.3,1.3,0.3,setosa
44 | 4.4,3.2,1.3,0.2,setosa
45 | 5,3.5,1.6,0.6,setosa
46 | 5.1,3.8,1.9,0.4,setosa
47 | 4.8,3,1.4,0.3,setosa
48 | 5.1,3.8,1.6,0.2,setosa
49 | 4.6,3.2,1.4,0.2,setosa
50 | 5.3,3.7,1.5,0.2,setosa
51 | 5,3.3,1.4,0.2,setosa
52 | 7,3.2,4.7,1.4,versicolor
53 | 6.4,3.2,4.5,1.5,versicolor
54 | 6.9,3.1,4.9,1.5,versicolor
55 | 5.5,2.3,4,1.3,versicolor
56 | 6.5,2.8,4.6,1.5,versicolor
57 | 5.7,2.8,4.5,1.3,versicolor
58 | 6.3,3.3,4.7,1.6,versicolor
59 | 4.9,2.4,3.3,1,versicolor
60 | 6.6,2.9,4.6,1.3,versicolor
61 | 5.2,2.7,3.9,1.4,versicolor
62 | 5,2,3.5,1,versicolor
63 | 5.9,3,4.2,1.5,versicolor
64 | 6,2.2,4,1,versicolor
65 | 6.1,2.9,4.7,1.4,versicolor
66 | 5.6,2.9,3.6,1.3,versicolor
67 | 6.7,3.1,4.4,1.4,versicolor
68 | 5.6,3,4.5,1.5,versicolor
69 | 5.8,2.7,4.1,1,versicolor
70 | 6.2,2.2,4.5,1.5,versicolor
71 | 5.6,2.5,3.9,1.1,versicolor
72 | 5.9,3.2,4.8,1.8,versicolor
73 | 6.1,2.8,4,1.3,versicolor
74 | 6.3,2.5,4.9,1.5,versicolor
75 | 6.1,2.8,4.7,1.2,versicolor
76 | 6.4,2.9,4.3,1.3,versicolor
77 | 6.6,3,4.4,1.4,versicolor
78 | 6.8,2.8,4.8,1.4,versicolor
79 | 6.7,3,5,1.7,versicolor
80 | 6,2.9,4.5,1.5,versicolor
81 | 5.7,2.6,3.5,1,versicolor
82 | 5.5,2.4,3.8,1.1,versicolor
83 | 5.5,2.4,3.7,1,versicolor
84 | 5.8,2.7,3.9,1.2,versicolor
85 | 6,2.7,5.1,1.6,versicolor
86 | 5.4,3,4.5,1.5,versicolor
87 | 6,3.4,4.5,1.6,versicolor
88 | 6.7,3.1,4.7,1.5,versicolor
89 | 6.3,2.3,4.4,1.3,versicolor
90 | 5.6,3,4.1,1.3,versicolor
91 | 5.5,2.5,4,1.3,versicolor
92 | 5.5,2.6,4.4,1.2,versicolor
93 | 6.1,3,4.6,1.4,versicolor
94 | 5.8,2.6,4,1.2,versicolor
95 | 5,2.3,3.3,1,versicolor
96 | 5.6,2.7,4.2,1.3,versicolor
97 | 5.7,3,4.2,1.2,versicolor
98 | 5.7,2.9,4.2,1.3,versicolor
99 | 6.2,2.9,4.3,1.3,versicolor
100 | 5.1,2.5,3,1.1,versicolor
101 | 5.7,2.8,4.1,1.3,versicolor
102 | 6.3,3.3,6,2.5,virginica
103 | 5.8,2.7,5.1,1.9,virginica
104 | 7.1,3,5.9,2.1,virginica
105 | 6.3,2.9,5.6,1.8,virginica
106 | 6.5,3,5.8,2.2,virginica
107 | 7.6,3,6.6,2.1,virginica
108 | 4.9,2.5,4.5,1.7,virginica
109 | 7.3,2.9,6.3,1.8,virginica
110 | 6.7,2.5,5.8,1.8,virginica
111 | 7.2,3.6,6.1,2.5,virginica
112 | 6.5,3.2,5.1,2,virginica
113 | 6.4,2.7,5.3,1.9,virginica
114 | 6.8,3,5.5,2.1,virginica
115 | 5.7,2.5,5,2,virginica
116 | 5.8,2.8,5.1,2.4,virginica
117 | 6.4,3.2,5.3,2.3,virginica
118 | 6.5,3,5.5,1.8,virginica
119 | 7.7,3.8,6.7,2.2,virginica
120 | 7.7,2.6,6.9,2.3,virginica
121 | 6,2.2,5,1.5,virginica
122 | 6.9,3.2,5.7,2.3,virginica
123 | 5.6,2.8,4.9,2,virginica
124 | 7.7,2.8,6.7,2,virginica
125 | 6.3,2.7,4.9,1.8,virginica
126 | 6.7,3.3,5.7,2.1,virginica
127 | 7.2,3.2,6,1.8,virginica
128 | 6.2,2.8,4.8,1.8,virginica
129 | 6.1,3,4.9,1.8,virginica
130 | 6.4,2.8,5.6,2.1,virginica
131 | 7.2,3,5.8,1.6,virginica
132 | 7.4,2.8,6.1,1.9,virginica
133 | 7.9,3.8,6.4,2,virginica
134 | 6.4,2.8,5.6,2.2,virginica
135 | 6.3,2.8,5.1,1.5,virginica
136 | 6.1,2.6,5.6,1.4,virginica
137 | 7.7,3,6.1,2.3,virginica
138 | 6.3,3.4,5.6,2.4,virginica
139 | 6.4,3.1,5.5,1.8,virginica
140 | 6,3,4.8,1.8,virginica
141 | 6.9,3.1,5.4,2.1,virginica
142 | 6.7,3.1,5.6,2.4,virginica
143 | 6.9,3.1,5.1,2.3,virginica
144 | 5.8,2.7,5.1,1.9,virginica
145 | 6.8,3.2,5.9,2.3,virginica
146 | 6.7,3.3,5.7,2.5,virginica
147 | 6.7,3,5.2,2.3,virginica
148 | 6.3,2.5,5,1.9,virginica
149 | 6.5,3,5.2,2,virginica
150 | 6.2,3.4,5.4,2.3,virginica
151 | 5.9,3,5.1,1.8,virginica
--------------------------------------------------------------------------------
/PySpark/data/movie_ratings_df.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/PySpark/data/movie_ratings_df.csv
--------------------------------------------------------------------------------
/PySpark/data/sample_data.csv:
--------------------------------------------------------------------------------
1 | ratings,age,experience,family,mobile
2 | 3,32,9,3,Vivo
3 | 3,27,13,3,Apple
4 | 4,22,2.5,0,Samsung
5 | 4,37,16.5,4,Apple
6 | 5,27,9,1,MI
7 | 4,27,9,0,Oppo
8 | 5,37,23,5,Vivo
9 | 5,37,23,5,Samsung
10 | 3,22,2.5,0,Apple
11 | 3,27,6,0,MI
12 | 2,27,6,2,Oppo
13 | 5,27,6,2,Samsung
14 | 3,37,16.5,5,Apple
15 | 5,27,6,0,MI
16 | 4,22,6,1,Oppo
17 | 4,37,9,2,Samsung
18 | 4,27,6,1,Apple
19 | 1,37,23,5,MI
20 | 2,42,23,2,Oppo
21 | 4,37,6,0,Vivo
22 | 5,22,2.5,0,Samsung
23 | 3,37,16.5,5,Apple
24 | 3,42,23,5,MI
25 | 2,27,9,2,Samsung
26 | 4,27,6,1,Apple
27 | 5,27,2.5,0,MI
28 | 2,27,6,2,Oppo
29 | 5,37,13,1,Vivo
30 | 2,32,16.5,2,Oppo
31 | 3,27,6,0,MI
32 | 3,27,6,0,MI
33 | 4,22,6,1,Oppo
34 | 4,37,6,0,Vivo
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python-Library-Learning
2 | A collection of short, hands-on examples for a variety of interesting Python libraries.
3 |
--------------------------------------------------------------------------------
/RSA实战/001-rsa生成公私钥并保存.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/7/15 12:00
3 | # @Author : xiaolu
4 | # @FileName: 001-rsa生成公私钥并保存.py
5 | # @Software: PyCharm
6 | import rsa
7 |
8 | pubkey, privkey = rsa.newkeys(1024)  # generate a public/private key pair
9 |
10 | pub = pubkey.save_pkcs1()  # serialize the keys so they can be stored
11 | pri = privkey.save_pkcs1()  # save_pkcs1() is a built-in method; its default format argument is "PEM"
12 |
13 | with open('pubkey.pem', mode='wb') as f, open('privkey.pem', mode='wb') as f1:
14 | f.write(pub)
15 | f1.write(pri)
16 |
--------------------------------------------------------------------------------
/RSA实战/002-公钥加密私钥解密.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/7/15 12:05
3 | # @Author : xiaolu
4 | # @FileName: 002-公钥加密私钥解密.py
5 | # @Software: PyCharm
6 | import rsa
7 |
8 | if __name__ == '__main__':
9 | with open('pubkey.pem', mode='rb') as f, open('privkey.pem', 'rb') as f1:
10 |         # Read the public and private keys from file
11 | pub = f.read()
12 | pri = f1.read()
13 |
14 |     # Restore the original key objects
15 | pubkey = rsa.PublicKey.load_pkcs1(pub)
16 | privkey = rsa.PrivateKey.load_pkcs1(pri)
17 |
18 |     message = '这是一条测试消息'
19 |     info = rsa.encrypt(message.encode('utf8'), pubkey)  # encrypt with the public key
20 |     msg = rsa.decrypt(info, privkey)  # decrypt with the private key
21 | print(msg.decode('utf8'))
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
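
Beyond encrypt/decrypt, the rsa package also supports the mirror-image flow: sign with the private key and verify with the public key. A minimal sketch, assuming pubkey and privkey have been loaded as in the script above:

    message = '需要签名的消息'.encode('utf8')
    signature = rsa.sign(message, privkey, 'SHA-256')  # sign with the private key
    print(rsa.verify(message, signature, pubkey))      # returns 'SHA-256'; raises VerificationError if the message was tampered with
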
/apscheduler实现定时任务/定时任务.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 定时任务.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-10-27
6 | """
7 | import time
8 | from datetime import datetime
9 | from apscheduler.schedulers.blocking import BlockingScheduler
10 |
11 |
12 | def my_job(text):
13 | print('{}'.format(text), time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
14 |
15 |
16 | if __name__ == '__main__':
17 | sched = BlockingScheduler()
18 |     # sched.add_job(my_job, 'interval', days=0, hours=24, minutes=0, seconds=0)  # run every 24 hours
19 |     # sched.add_job(my_job, 'interval', seconds=5, args=['北京时间:'])  # run every 5 seconds with the interval trigger
20 |
21 |     # Run once at a specific point in time
22 |     sched.add_job(my_job, 'date', run_date=datetime(2021, 10, 27, 17, 8, 5), args=['北京时间:'])
23 | sched.start()
--------------------------------------------------------------------------------
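
Besides the interval and date triggers shown above, APScheduler also has a cron trigger for recurring wall-clock schedules. A short sketch using the same my_job and sched (add the job before sched.start(), which blocks):

    # run every weekday at 17:30
    sched.add_job(my_job, 'cron', day_of_week='mon-fri', hour=17, minute=30, args=['北京时间:'])
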
/chinesebert中的pinyin和glyph的处理/MSYH.TTC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/chinesebert中的pinyin和glyph的处理/MSYH.TTC
--------------------------------------------------------------------------------
/chinesebert中的pinyin和glyph的处理/image_test.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : image_test.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-07-22
6 | """
7 | import os
8 | import pygame
9 |
10 | chinese_dir = 'chinese'
11 | if not os.path.exists(chinese_dir):
12 | os.mkdir(chinese_dir)
13 |
14 | pygame.init()
15 |
16 | # 1. Render every common CJK character as an image
17 | # start, end = (0x4E00, 0x9FA5)  # unicode range of common CJK characters
18 | #
19 | # for codepoint in range(int(start), int(end)):
20 | #     word = chr(codepoint)
21 | #     font = pygame.font.Font("MSYH.TTC", 22)  # needs the Microsoft YaHei font file msyh.ttc in the current directory (or copy it from C:\Windows\Fonts)
22 | #     rtext = font.render(word, True, (0, 0, 0), (255, 255, 255))
23 | #     pygame.image.save(rtext, os.path.join(chinese_dir, word + ".png"))
24 |
25 | # 2. Render a single character; to use a different font, just swap in another .TTC file
26 | word = '新'
27 | font = pygame.font.Font("MSYH.TTC", 22)  # needs the Microsoft YaHei font file msyh.ttc in the current directory (or copy it from C:\Windows\Fonts)
28 | rtext = font.render(word, True, (0, 0, 0), (255, 255, 255))
29 | pygame.image.save(rtext, os.path.join(chinese_dir, word + ".png"))
30 |
31 |
--------------------------------------------------------------------------------
/chinesebert中的pinyin和glyph的处理/pinyin_test.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : pinyin_test.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-07-22
6 | """
7 | from pypinyin import pinyin, lazy_pinyin, Style
8 |
9 | if __name__ == '__main__':
10 |     print(pinyin('新浪微博'))  # output: [['xīn'], ['làng'], ['wēi'], ['bó']]
11 |
12 |     print(lazy_pinyin('新浪微博'))  # output: ['xin', 'lang', 'wei', 'bo']
13 |
14 |     # Append the tone as a digit after each syllable
15 |     style = Style.TONE3  # 1, 2, 3 and 4 denote the four Mandarin tones
16 | print(lazy_pinyin('新浪微博', style=style))
17 |
18 |
--------------------------------------------------------------------------------
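
Two more pypinyin options worth knowing: heteronym=True returns every candidate pronunciation of a polyphonic character, and Style.TONE2 places the tone digit right after the vowel instead of at the end of the syllable. A short sketch (expected outputs shown as approximate comments):

    from pypinyin import pinyin, lazy_pinyin, Style

    print(pinyin('中心', heteronym=True))                # 中 is polyphonic, e.g. [['zhōng', 'zhòng'], ['xīn']]
    print(lazy_pinyin('新浪微博', style=Style.TONE2))     # tone digit after the vowel, e.g. ['xi1n', 'la4ng', ...]
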
/collections的用法/001-collections中的namedtuple用法.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/8/3 15:57
3 | # @Author : xiaolu
4 | # @FileName: 001-collections中的namedtuple用法.py
5 | # @Software: PyCharm
6 |
7 | # In short: namedtuple is a convenient shorthand for defining a simple class
8 | from collections import namedtuple
9 |
10 | Point = namedtuple("Point", ['x', 'y'])
11 | # Equivalent to defining a Point class with attributes x and y
12 | p = Point(1, 2)
13 | print(p.x)
14 | print(p.y)
15 |
16 |
17 | # In deep learning, this is a handy way to define a config object
18 | from collections import namedtuple
19 | Config = namedtuple('Config', ['learning_rate',
20 | 'epoch',
21 | 'device',
22 | 'batch_size',
23 | 'vocab_size'])
24 |
25 |
26 | config = Config(
27 | learning_rate=1e-5,
28 | epoch=10,
29 | device=4,
30 | batch_size=32,
31 | vocab_size=12239
32 | )
33 | print(config.learning_rate)
34 |
35 |
36 |
--------------------------------------------------------------------------------
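
namedtuple instances are immutable, which matters when they are used as configs like above. Two built-in helpers cover the common needs: _asdict converts an instance to a dict, and _replace returns a modified copy. A short sketch reusing the Config instance from the file:

    print(config._asdict())                        # e.g. {'learning_rate': 1e-05, 'epoch': 10, ...}
    config2 = config._replace(learning_rate=2e-5)  # immutable, so _replace returns a new instance
    print(config2.learning_rate)
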
/elasticsearch/001-创建库并插入数据.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/7/30 15:04
3 | # @Author : xiaolu
4 | # @FileName: 001-创建库并插入数据.py
5 | # @Software: PyCharm
6 | from elasticsearch import Elasticsearch
7 |
8 |
9 | es = Elasticsearch()
10 |
11 | # result = es.indices.delete(index='point_type', ignore=[400, 404])  # delete the index
12 | # exit()
13 |
14 |
15 | mapping = {
16 | "settings": {
17 | "analysis": {
18 | "filter": {
19 | "jieba_stop": {
20 | "type": "stop",
21 | "stopwords_path": "stopwords/stopwords.txt"
22 | },
23 | "jieba_synonym": {
24 | "type": "synonym",
25 | "synonyms_path": "synonyms/synonyms.txt"
26 | },
27 | "my_shingle_filter": {
28 | "type": "shingle",
29 | "min_shingle_size": 2,
30 | "max_shingle_size": 2,
31 | "output_unigrams": False
32 | }
33 | },
34 |         "analyzer": {
35 |             "word_ana": {
36 |                 "tokenizer": "jieba_search",  # tokenize with jieba
37 |                 "filter": "jieba_stop"  # filter with the jieba stopword list
38 |             },
39 |             "char_ana": {
40 |                 "tokenizer": "standard",  # for characters, use the standard tokenizer, i.e. split per character
41 |                 "filter": "jieba_stop"  # also filter with the jieba stopword list
42 |             },
43 | "char_bigram_ana": {
44 | "type": "custom",
45 | "tokenizer": "standard",
46 | "filter": [
47 | "jieba_stop",
48 | "my_shingle_filter"
49 | ]
50 | },
51 | "word_bigram_ana": {
52 | "type": "custom",
53 | "tokenizer": "jieba_search",
54 | "filter": [
55 | "jieba_stop",
56 | "my_shingle_filter"
57 | ]
58 | }
59 | }
60 | }
61 | },
62 | "mappings": {
63 | "properties": {
64 | "title": {
65 | "type": "keyword"
66 | },
67 | "author": {
68 | "type": "keyword"
69 | },
70 | "dynasty": {
71 | "type": "keyword"
72 | },
73 | "words": {
74 | "type": "integer"
75 | },
76 | "content": {
77 | "analyzer": "word_ana",
78 | "search_analyzer": "word_ana",
79 | "type": "text"
80 | }
81 | }
82 | }
83 | }
84 | # In effect: when a document is indexed, its content field is tokenized and run through the jieba stopword filter; a query against content goes through the same tokenize-and-filter pipeline before matching.
85 |
86 | # es.indices.create(index='point_type', body=mapping)
87 |
88 | # Then insert the documents
89 | data = [
90 | {
91 | "title": "静夜思",
92 | "author": "李白",
93 | "dynasty": "唐",
94 | "words": "20",
95 | "content": "床前明月光,疑是地上霜。举头望明月,低头思故乡。"
96 | },
97 |
98 | {
99 | "title": "观沧海",
100 | "author": "曹操",
101 | "dynasty": "东汉末年",
102 | "words": "56",
103 | "content": "东临碣石,以观沧海。水何澹澹,山岛竦峙。树木丛生,百草丰茂。秋风萧瑟,洪波涌起。日月之行,若出其中。星汉灿烂,若出其里。幸甚至哉,歌以咏志。"
104 | },
105 |
106 | {
107 | "title": "咏鹅",
108 | "author": "骆宾王",
109 | "dynasty": "唐",
110 | "words": "18",
111 | "content": "鹅鹅鹅,曲项向天歌。白毛浮绿水,红掌拨清波。"
112 | },
113 |
114 | {
115 | "title": "将进酒",
116 | "author": "陈陶",
117 | "dynasty": "唐",
118 | "words": "14",
119 | "content": "银鸭金鹅言待谁,隋家岳渎皇家有"
120 | },
121 |
122 | {
123 | "title": "春雪",
124 | "author": "白居易",
125 | "dynasty": "唐",
126 | "words": "10",
127 | "content": "大似落鹅毛,密如飘玉屑"
128 | }
129 | ]
130 | for d in data:
131 | es.index(index='point_type', body=d)
132 |
--------------------------------------------------------------------------------
/elasticsearch/002-es中的搜索.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/7/30 15:52
3 | # @Author : xiaolu
4 | # @FileName: 002-es中的搜索.py
5 | # @Software: PyCharm
6 | from elasticsearch import Elasticsearch
7 |
8 |
9 | if __name__ == '__main__':
10 | es = Elasticsearch()
11 |     querys = '东临碣石'  # not used by the dsl below, which matches on title; see the content-search sketch after this file
12 | dsl = {
13 | 'query': {
14 | 'match': {
15 | 'title': '咏鹅'
16 | }
17 | }
18 | }
19 |     results = es.search(index='point_type', body=dsl)['hits']['hits']  # a search can return multiple hits, so this is a list
20 |
21 | res = []
22 | for result in results:
23 | res.append(result['_source'])
24 | print(res)
25 |
26 |
--------------------------------------------------------------------------------
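
The dsl above matches on the title keyword field, so the querys string is never exercised. A sketch of a full-text query against the content field, which (per the mapping in 001) is analyzed with the jieba tokenizer and stopword filter on both the index and query sides:

    dsl = {
        'query': {
            'match': {
                'content': querys  # analyzed with the same jieba pipeline before matching
            }
        }
    }
    for hit in es.search(index='point_type', body=dsl)['hits']['hits']:
        print(hit['_score'], hit['_source']['title'])
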
/flask+echart+ajax/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/flask+echart+ajax/.DS_Store
--------------------------------------------------------------------------------
/flask+echart+ajax/app.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : app.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2022-01-08
6 | """
7 | import random
8 | from flask import Flask, render_template, jsonify
9 |
10 | app = Flask(__name__)
11 |
12 |
13 | @app.route('/left_data')
14 | def get_left_data():
15 | day = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
16 | # nums = [150, 230, 224, 218, 135, 147, 260]
17 | nums = [random.randint(0, 100) for _ in range(len(day))]
18 | random.shuffle(nums)
19 | data = {'day': day, 'nums': nums}
20 | return jsonify(data)
21 |
22 |
23 | @app.route('/')
24 | def index():
25 | return render_template('index.html')
26 |
27 |
28 | if __name__ == '__main__':
29 | # app.run(port=6000)
30 | app.run(host='0.0.0.0')
--------------------------------------------------------------------------------
/flask+echart+ajax/static/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/flask+echart+ajax/static/.DS_Store
--------------------------------------------------------------------------------
/flask+echart+ajax/static/css/main.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | background: #333;
4 | }
5 |
6 | #left {
7 | position: absolute;
8 | width: 50%;
9 | height: 100%;
10 | top: 0%;
11 | left: 0%;
12 | background: #666666;
13 | /* color: white; */
14 | }
15 |
16 | #right {
17 | position: absolute;
18 | width: 50%;
19 | height: 100%;
20 | top: 0%;
21 | right: 0%;
22 | color: #FFFFFF;
23 | /* font-size: 20px; */
24 | background: green;
25 | }
--------------------------------------------------------------------------------
/flask+echart+ajax/static/js/controller.js:
--------------------------------------------------------------------------------
1 | function get_left_data() {
2 | $.ajax({
3 | url:"/left_data",
4 | success: function(data) {
5 | option_left.xAxis.data = data.day
6 | option_left.series[0].data = data.nums
7 | ec_left.setOption(option_left)
8 | },
9 | error: function(xhr, type, errorThrown) {
10 | }
11 | })
12 | }
13 |
14 | get_left_data()
15 | setInterval(get_left_data, 1000*5)
--------------------------------------------------------------------------------
/flask+echart+ajax/static/js/left.js:
--------------------------------------------------------------------------------
1 | var ec_left = echarts.init(document.getElementById("left"), "dark");
2 |
3 | option_left = {
4 | xAxis: {
5 | type: 'category',
6 | data: []
7 | },
8 | yAxis: {
9 | type: 'value'
10 | },
11 | series: [
12 | {
13 | data: [],
14 | type: 'line'
15 | }
16 | ]
17 | };
18 | ec_left.setOption(option_left);
--------------------------------------------------------------------------------
/flask+echart+ajax/static/js/right.js:
--------------------------------------------------------------------------------
1 | var ec_right = echarts.init(document.getElementById("right"), "dark");
2 |
3 | option_right = {
4 | xAxis: {
5 | type: 'category',
6 | data: ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
7 | },
8 | yAxis: {
9 | type: 'value'
10 | },
11 | series: [
12 | {
13 | data: [150, 230, 224, 218, 135, 147, 260],
14 | type: 'line'
15 | }
16 | ]
17 | };
18 | ec_right.setOption(option_right);
--------------------------------------------------------------------------------
/flask+echart+ajax/templates/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 |     <meta charset="utf-8">
5 |     <title>Title</title>
6 |     <!-- markup reconstructed from main.css and the js files; paths assume Flask's default /static route -->
7 |     <link rel="stylesheet" href="/static/css/main.css">
8 | </head>
9 | <body>
10 |     <div id="left"></div>
11 |     <div id="right"></div>
12 |     <script src="/static/js/jquery.js"></script>
13 |     <script src="/static/js/echarts.min.js"></script>
14 |     <script src="/static/js/left.js"></script>
15 |     <script src="/static/js/right.js"></script>
16 |     <script src="/static/js/controller.js"></script>
17 | </body>
18 | </html>
--------------------------------------------------------------------------------
/flask表单那些事/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/flask表单那些事/.DS_Store
--------------------------------------------------------------------------------
/flask表单那些事/app.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : app.py
4 | # @Time : 2020/11/19 3:54 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | from flask import Flask
10 | from flask import render_template, request
11 |
12 | app = Flask(__name__)
13 |
14 |
15 | @app.route('/', methods=['POST', 'GET'])
16 | def my_index():
17 | user_name = request.form.get('username')
18 | if user_name is not None:
19 | pass_word = request.form.get('pwd')
20 | sex = request.form.getlist('sex')
21 | property = request.form.getlist('property')
22 | content = request.form.get('content')
23 | print(content)
24 | print(user_name)
25 | print(pass_word)
26 | print(sex)
27 | print(property)
28 | return render_template('index.html')
29 |
30 |
31 | if __name__ == '__main__':
32 | app.run()
33 |
34 |
--------------------------------------------------------------------------------
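
A quick way to exercise the handler above without a browser; a sketch using requests (hypothetical field values, app assumed to run on Flask's default port 5000). Note how list values become repeated form fields, which is exactly what request.form.getlist() collects:

import requests

resp = requests.post('http://127.0.0.1:5000/', data={
    'username': 'xiaolu',
    'pwd': '123456',
    'sex': ['男'],                 # repeated keys -> request.form.getlist('sex')
    'property': ['房子', '车子'],  # repeated keys -> request.form.getlist('property')
    'content': 'hello form',
})
print(resp.status_code)
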
/flask表单那些事/templates/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 |     <meta charset="utf-8">
5 |     <title>表单那些事</title>
6 | </head>
7 | <body>
8 |     <!-- form reconstructed from the fields app.py reads; the original markup was stripped from this dump -->
9 |     <form action="/" method="post">
10 |         username: <input type="text" name="username"><br>
11 |         password: <input type="password" name="pwd"><br>
12 |         sex: <input type="checkbox" name="sex" value="男">男
13 |              <input type="checkbox" name="sex" value="女">女<br>
14 |         property: <input type="checkbox" name="property" value="房">房
15 |                   <input type="checkbox" name="property" value="车">车<br>
16 |         content: <textarea name="content"></textarea><br>
17 |         <input type="submit" value="提交">
18 |     </form>
19 | </body>
20 | </html>
--------------------------------------------------------------------------------
/gensim/001-TF-IDF句子相似度计算.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/8/5 11:47
3 | # @Author : xiaolu
4 | # @FileName: 001-TF-IDF句子相似度计算.py
5 | # @Software: PyCharm
6 | import jieba
7 | from gensim import corpora, models, similarities
8 |
9 | import numpy as np
10 | import linecache
11 |
12 |
13 | def similarity(query_path, query):
14 | '''
15 | :param query_path: 问题库的路径
16 | :param query: 所提的问题
17 | :return: 问题库中与当前问题相似的问题索引
18 | '''
19 | # 对问题库中的问题处理
20 | questions = []
21 | with open(query_path, 'r', encoding='utf8') as f:
22 | for line in f.readlines():
23 | line = line.strip()
24 | line = jieba.lcut(line)
25 | temp = []
26 | for w in line:
27 | if w not in stopword:
28 | temp.append(w)
29 | questions.append(temp)
30 |
31 | # 创建词典
32 | dictionary = corpora.Dictionary(questions)
33 | # 基于词典,将分词列表集转换成稀疏向量集,即语料库
34 | questions = [dictionary.doc2bow(ques) for ques in questions]
35 | # 训练TF-IDF模型,传入语料库进行训练
36 | tfidf = models.TfidfModel(questions) # 传入的向量集
37 | # 用训练好的TF-IDF模型处理被检索文本,即语料库
38 | corpus_tfidf = tfidf[questions]
39 | # for temp in corpus_tfidf: # 每个问题中的每个词的tfidf值
40 | # print(temp)
41 | # 对当前所问问题进行处理
42 |
43 | new_vec = dictionary.doc2bow(query.split())
44 | new_vec_tfidf = tfidf[new_vec]
45 |
46 | # 计算当前问题与问题库中所有问题的相似度
47 |     index = similarities.MatrixSimilarity(corpus_tfidf)  # similarity index over the question bank
48 |     sims = index[new_vec_tfidf]  # one similarity score per question in the bank
49 | # print(sims)
50 |
51 | max_loc = np.argmax(sims) # 最相似的问题(问题库)编号
52 | max_sim = sims[max_loc]
53 | # print(max_loc) # 5 相似问题的编号
54 | # print(max_sim) # 1.0 相似程度
55 |
56 | # 句子相似度阈值
57 | sup = 0.7
58 | # row_index默认为-1,即未匹配到满足相似度阈值的问题
59 | row_index = -1
60 | if max_sim > sup:
61 | # 相似度最大值对应文件中问题所在的行索引
62 | row_index = max_loc + 1
63 | return row_index
64 |
65 |
66 | def get_answer(answer_path, row_index):
67 | """
68 | :func: 得到问题对应的答案
69 | :param answer_path: 答案存储所在文件路径
70 | :param row_index: 答案的行索引
71 | :return:
72 | """
73 | answer = linecache.getline(answer_path, row_index)
74 | return answer
75 |
76 |
77 | if __name__ == '__main__':
78 | answer_path = './data/answer.txt'
79 | query_path = './data/question.txt'
80 |
81 | # 加载停用词
82 | stopword = []
83 | with open('./data/stopwords.txt', 'r', encoding='utf8') as f:
84 | for line in f.readlines():
85 | line = line.strip()
86 | stopword.append(line)
87 | print('退出请按q')
88 | while True:
89 | question = input('>:')
90 | if question == 'q':
91 | break
92 |
93 | # 首先分词然后去除停用词
94 | res = jieba.lcut(question)
95 | question_sep = []
96 | for r in res:
97 | if r not in stopword:
98 | question_sep.append(r)
99 | # question_sep 是问题经过分词, 停用词处理后的词表
100 | query = ' '.join(line for line in question_sep)
101 |
102 |         # match against the question bank first; -1 means no question passed the similarity threshold
103 |         row_index = similarity(query_path, query)
104 |         if row_index == -1:
105 |             print('<: 抱歉, 没有找到相似的问题')
106 |             continue
107 |         answer = get_answer(answer_path, row_index)
108 |         print('<:', answer)
109 |
--------------------------------------------------------------------------------
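
The moving parts of similarity() condensed into a toy example (hand-segmented documents, no stop-word handling), just to show the dictionary -> bow -> tfidf -> MatrixSimilarity chain:

from gensim import corpora, models, similarities

docs = [['中国', '首都', '北京'], ['美国', '首都', '华盛顿'], ['姚明', '老婆', '叶莉']]
dictionary = corpora.Dictionary(docs)               # word <-> id mapping
bows = [dictionary.doc2bow(d) for d in docs]        # sparse bag-of-words vectors
tfidf = models.TfidfModel(bows)                     # train tf-idf weights on the corpus
index = similarities.MatrixSimilarity(tfidf[bows])  # cosine-similarity index

query_bow = dictionary.doc2bow(['中国', '首都', '在', '哪儿'])
print(index[tfidf[query_bow]])  # one cosine score per document; argmax picks the best match
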
/gensim/002-gensim文本摘要.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/3 14:43
4 | @Auth : xiaolu
5 | @File :002-gensim文本摘要.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | import re
10 | from LAC import LAC
11 | from gensim.summarization.summarizer import summarize  # note: gensim.summarization was removed in gensim 4.0; this script needs gensim<4
12 |
13 |
14 | def clean(content):
15 | content = content.replace('.', '')
16 | content = content.replace(' ', '')
17 | content = content.replace('\n', '.')
18 | return content
19 |
20 |
21 | def process_data(text, lac):
22 | # 首先对text进行分句子 主要防止摘要为半句话
23 | text = re.split('[.。?!]', text)
24 |
25 | sentences = []
26 | for t in text:
27 | if len(t) == 0:
28 | continue
29 | t = lac.run(t)
30 | sentences.append(' '.join(t))
31 |
32 | # 最后用.将句子连起来
33 | return '. '.join(sentences)
34 |
35 |
36 | if __name__ == '__main__':
37 | lac = LAC(mode='seg')
38 |
39 | # 1. 加载文章
40 | data = []
41 | with open('./data/text.txt', 'r', encoding='utf8') as f:
42 | lines = f.readlines()
43 | for i, line in enumerate(lines):
44 | line = line.strip()
45 | line = process_data(line, lac)
46 | line = summarize(line)
47 | line = clean(line)
48 | print('*' * 20 + '第{}篇文章的摘要'.format(i + 1) + '*' * 20)
49 | print(line)
50 |
--------------------------------------------------------------------------------
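
For reference, summarize() exposes knobs for summary length. A small sketch, assuming gensim<4 and a hypothetical raw_article string prepared by process_data as above (word_count overrides ratio when both are given):

from gensim.summarization.summarizer import summarize

prepared = process_data(raw_article, lac)   # space-separated, '.'-joined sentences as above
print(summarize(prepared, ratio=0.2))       # keep roughly 20% of the sentences
print(summarize(prepared, word_count=80))   # or cap the summary at ~80 words
print(summarize(prepared, split=True))      # return a list of sentences instead of one string
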
/gensim/data/answer.txt:
--------------------------------------------------------------------------------
1 | 中国的首都是北京
2 | 美国的首都在华盛顿
3 | 陕西的省会城市是西安
4 | 山西的省会城市是太原
5 | 姚明的老婆是叶莉
6 | 姚明的女儿是姚沁蕾
7 | 国家主席是习近平
--------------------------------------------------------------------------------
/gensim/data/question.txt:
--------------------------------------------------------------------------------
1 | 中国的首都在哪儿
2 | 美国的首都在哪儿
3 | 陕西的省会城市在哪
4 | 山西的省会城市在哪儿
5 | 姚明的老婆是谁
6 | 姚明的女儿是谁
7 | 国家主席是谁呀
--------------------------------------------------------------------------------
/gensim/data/stopwords.txt:
--------------------------------------------------------------------------------
1 | ———
2 | 》),
3 | )÷(1-
4 | ”,
5 | )、
6 | =(
7 | :
8 | →
9 | ℃
10 | &
11 | *
12 | 一一
13 | ~~~~
14 | ’
15 | .
16 | 『
17 | .一
18 | ./
19 | --
20 | 』
21 | =″
22 | 【
23 | [*]
24 | }>
25 | [⑤]]
26 | [①D]
27 | c]
28 | ng昉
29 | *
30 | //
31 | [
32 | ]
33 | [②e]
34 | [②g]
35 | ={
36 | }
37 | ,也
38 | ‘
39 | A
40 | [①⑥]
41 | [②B]
42 | [①a]
43 | [④a]
44 | [①③]
45 | [③h]
46 | ③]
47 | 1.
48 | --
49 | [②b]
50 | ’‘
51 | ×××
52 | [①⑧]
53 | 0:2
54 | =[
55 | [⑤b]
56 | [②c]
57 | [④b]
58 | [②③]
59 | [③a]
60 | [④c]
61 | [①⑤]
62 | [①⑦]
63 | [①g]
64 | ∈[
65 | [①⑨]
66 | [①④]
67 | [①c]
68 | [②f]
69 | [②⑧]
70 | [②①]
71 | [①C]
72 | [③c]
73 | [③g]
74 | [②⑤]
75 | [②②]
76 | 一.
77 | [①h]
78 | .数
79 | []
80 | [①B]
81 | 数/
82 | [①i]
83 | [③e]
84 | [①①]
85 | [④d]
86 | [④e]
87 | [③b]
88 | [⑤a]
89 | [①A]
90 | [②⑧]
91 | [②⑦]
92 | [①d]
93 | [②j]
94 | 〕〔
95 | ][
96 | ://
97 | ′∈
98 | [②④
99 | [⑤e]
100 | 12%
101 | b]
102 | ...
103 | ...................
104 | …………………………………………………③
105 | ZXFITL
106 | [③F]
107 | 」
108 | [①o]
109 | ]∧′=[
110 | ∪φ∈
111 | ′|
112 | {-
113 | ②c
114 | }
115 | [③①]
116 | R.L.
117 | [①E]
118 | Ψ
119 | -[*]-
120 | ↑
121 | .日
122 | [②d]
123 | [②
124 | [②⑦]
125 | [②②]
126 | [③e]
127 | [①i]
128 | [①B]
129 | [①h]
130 | [①d]
131 | [①g]
132 | [①②]
133 | [②a]
134 | f]
135 | [⑩]
136 | a]
137 | [①e]
138 | [②h]
139 | [②⑥]
140 | [③d]
141 | [②⑩]
142 | e]
143 | 〉
144 | 】
145 | 元/吨
146 | [②⑩]
147 | 2.3%
148 | 5:0
149 | [①]
150 | ::
151 | [②]
152 | [③]
153 | [④]
154 | [⑤]
155 | [⑥]
156 | [⑦]
157 | [⑧]
158 | [⑨]
159 | ……
160 | ——
161 | ?
162 | 、
163 | 。
164 | “
165 | ”
166 | 《
167 | 》
168 | !
169 | ,
170 | :
171 | ;
172 | ?
173 | .
174 | ,
175 | .
176 | '
177 | ?
178 | ·
179 | ———
180 | ──
181 | ?
182 | —
183 | <
184 | >
185 | (
186 | )
187 | 〔
188 | 〕
189 | [
190 | ]
191 | (
192 | )
193 | -
194 | +
195 | ~
196 | ×
197 | /
198 | /
199 | ①
200 | ②
201 | ③
202 | ④
203 | ⑤
204 | ⑥
205 | ⑦
206 | ⑧
207 | ⑨
208 | ⑩
209 | Ⅲ
210 | В
211 | "
212 | ;
213 | #
214 | @
215 | γ
216 | μ
217 | φ
218 | φ.
219 | ×
220 | Δ
221 | ■
222 | ▲
223 | sub
224 | exp
225 | sup
226 | sub
227 | Lex
228 | #
229 | %
230 | &
231 | '
232 | +
233 | +ξ
234 | ++
235 | -
236 | -β
237 | <
238 | <±
239 | <Δ
240 | <λ
241 | <φ
242 | <<
243 | =
244 | =
245 | =☆
246 | =-
247 | >
248 | >λ
249 | _
250 | ~±
251 | ~+
252 | [⑤f]
253 | [⑤d]
254 | [②i]
255 | ≈
256 | [②G]
257 | [①f]
258 | LI
259 | ㈧
260 | [-
261 | ......
262 | 〉
263 | [③⑩]
264 | 第二
265 | 一番
266 | 一直
267 | 一个
268 | 一些
269 | 许多
270 | 种
271 | 有的是
272 | 也就是说
273 | 末##末
274 | 啊
275 | 阿
276 | 哎
277 | 哎呀
278 | 哎哟
279 | 唉
280 | 俺
281 | 俺们
282 | 按
283 | 按照
284 | 吧
285 | 吧哒
286 | 把
287 | 罢了
288 | 被
289 | 本
290 | 本着
291 | 比
292 | 比方
293 | 比如
294 | 鄙人
295 | 彼
296 | 彼此
297 | 边
298 | 别
299 | 别的
300 | 别说
301 | 并
302 | 并且
303 | 不比
304 | 不成
305 | 不单
306 | 不但
307 | 不独
308 | 不管
309 | 不光
310 | 不过
311 | 不仅
312 | 不拘
313 | 不论
314 | 不怕
315 | 不然
316 | 不如
317 | 不特
318 | 不惟
319 | 不问
320 | 不只
321 | 朝
322 | 朝着
323 | 趁
324 | 趁着
325 | 乘
326 | 冲
327 | 除
328 | 除此之外
329 | 除非
330 | 除了
331 | 此
332 | 此间
333 | 此外
334 | 从
335 | 从而
336 | 打
337 | 待
338 | 但
339 | 但是
340 | 当
341 | 当着
342 | 到
343 | 得
344 | 的
345 | 的话
346 | 等
347 | 等等
348 | 地
349 | 第
350 | 叮咚
351 | 对
352 | 对于
353 | 多
354 | 多少
355 | 而
356 | 而况
357 | 而且
358 | 而是
359 | 而外
360 | 而言
361 | 而已
362 | 尔后
363 | 反过来
364 | 反过来说
365 | 反之
366 | 非但
367 | 非徒
368 | 否则
369 | 嘎
370 | 嘎登
371 | 该
372 | 赶
373 | 个
374 | 各
375 | 各个
376 | 各位
377 | 各种
378 | 各自
379 | 给
380 | 根据
381 | 跟
382 | 故
383 | 故此
384 | 固然
385 | 关于
386 | 管
387 | 归
388 | 果然
389 | 果真
390 | 过
391 | 哈
392 | 哈哈
393 | 呵
394 | 和
395 | 何
396 | 何处
397 | 何况
398 | 何时
399 | 嘿
400 | 哼
401 | 哼唷
402 | 呼哧
403 | 乎
404 | 哗
405 | 还是
406 | 还有
407 | 换句话说
408 | 换言之
409 | 或
410 | 或是
411 | 或者
412 | 极了
413 | 及
414 | 及其
415 | 及至
416 | 即
417 | 即便
418 | 即或
419 | 即令
420 | 即若
421 | 即使
422 | 几
423 | 几时
424 | 己
425 | 既
426 | 既然
427 | 既是
428 | 继而
429 | 加之
430 | 假如
431 | 假若
432 | 假使
433 | 鉴于
434 | 将
435 | 较
436 | 较之
437 | 叫
438 | 接着
439 | 结果
440 | 借
441 | 紧接着
442 | 进而
443 | 尽
444 | 尽管
445 | 经
446 | 经过
447 | 就
448 | 就是
449 | 就是说
450 | 据
451 | 具体地说
452 | 具体说来
453 | 开始
454 | 开外
455 | 靠
456 | 咳
457 | 可
458 | 可见
459 | 可是
460 | 可以
461 | 况且
462 | 啦
463 | 来
464 | 来着
465 | 离
466 | 例如
467 | 哩
468 | 连
469 | 连同
470 | 两者
471 | 了
472 | 临
473 | 另
474 | 另外
475 | 另一方面
476 | 论
477 | 嘛
478 | 吗
479 | 慢说
480 | 漫说
481 | 冒
482 | 么
483 | 每
484 | 每当
485 | 们
486 | 莫若
487 | 某
488 | 某个
489 | 某些
490 | 拿
491 | 哪
492 | 哪边
493 | 哪儿
494 | 哪个
495 | 哪里
496 | 哪年
497 | 哪怕
498 | 哪天
499 | 哪些
500 | 哪样
501 | 那
502 | 那边
503 | 那儿
504 | 那个
505 | 那会儿
506 | 那里
507 | 那么
508 | 那么些
509 | 那么样
510 | 那时
511 | 那些
512 | 那样
513 | 乃
514 | 乃至
515 | 呢
516 | 能
517 | 你
518 | 你们
519 | 您
520 | 宁
521 | 宁可
522 | 宁肯
523 | 宁愿
524 | 哦
525 | 呕
526 | 啪达
527 | 旁人
528 | 呸
529 | 凭
530 | 凭借
531 | 其
532 | 其次
533 | 其二
534 | 其他
535 | 其它
536 | 其一
537 | 其余
538 | 其中
539 | 起
540 | 起见
541 | 起见
542 | 岂但
543 | 恰恰相反
544 | 前后
545 | 前者
546 | 且
547 | 然而
548 | 然后
549 | 然则
550 | 让
551 | 人家
552 | 任
553 | 任何
554 | 任凭
555 | 如
556 | 如此
557 | 如果
558 | 如何
559 | 如其
560 | 如若
561 | 如上所述
562 | 若
563 | 若非
564 | 若是
565 | 啥
566 | 上下
567 | 尚且
568 | 设若
569 | 设使
570 | 甚而
571 | 甚么
572 | 甚至
573 | 省得
574 | 时候
575 | 什么
576 | 什么样
577 | 使得
578 | 是
579 | 是的
580 | 首先
581 | 谁
582 | 谁知
583 | 顺
584 | 顺着
585 | 似的
586 | 虽
587 | 虽然
588 | 虽说
589 | 虽则
590 | 随
591 | 随着
592 | 所
593 | 所以
594 | 他
595 | 他们
596 | 他人
597 | 它
598 | 它们
599 | 她
600 | 她们
601 | 倘
602 | 倘或
603 | 倘然
604 | 倘若
605 | 倘使
606 | 腾
607 | 替
608 | 通过
609 | 同
610 | 同时
611 | 哇
612 | 万一
613 | 往
614 | 望
615 | 为
616 | 为何
617 | 为了
618 | 为什么
619 | 为着
620 | 喂
621 | 嗡嗡
622 | 我
623 | 我们
624 | 呜
625 | 呜呼
626 | 乌乎
627 | 无论
628 | 无宁
629 | 毋宁
630 | 嘻
631 | 吓
632 | 相对而言
633 | 像
634 | 向
635 | 向着
636 | 嘘
637 | 呀
638 | 焉
639 | 沿
640 | 沿着
641 | 要
642 | 要不
643 | 要不然
644 | 要不是
645 | 要么
646 | 要是
647 | 也
648 | 也罢
649 | 也好
650 | 一
651 | 一般
652 | 一旦
653 | 一方面
654 | 一来
655 | 一切
656 | 一样
657 | 一则
658 | 依
659 | 依照
660 | 矣
661 | 以
662 | 以便
663 | 以及
664 | 以免
665 | 以至
666 | 以至于
667 | 以致
668 | 抑或
669 | 因
670 | 因此
671 | 因而
672 | 因为
673 | 哟
674 | 用
675 | 由
676 | 由此可见
677 | 由于
678 | 有
679 | 有的
680 | 有关
681 | 有些
682 | 又
683 | 于
684 | 于是
685 | 于是乎
686 | 与
687 | 与此同时
688 | 与否
689 | 与其
690 | 越是
691 | 云云
692 | 哉
693 | 再说
694 | 再者
695 | 在
696 | 在下
697 | 咱
698 | 咱们
699 | 则
700 | 怎
701 | 怎么
702 | 怎么办
703 | 怎么样
704 | 怎样
705 | 咋
706 | 照
707 | 照着
708 | 者
709 | 这
710 | 这边
711 | 这儿
712 | 这个
713 | 这会儿
714 | 这就是说
715 | 这里
716 | 这么
717 | 这么点儿
718 | 这么些
719 | 这么样
720 | 这时
721 | 这些
722 | 这样
723 | 正如
724 | 吱
725 | 之
726 | 之类
727 | 之所以
728 | 之一
729 | 只是
730 | 只限
731 | 只要
732 | 只有
733 | 至
734 | 至于
735 | 诸位
736 | 着
737 | 着呢
738 | 自
739 | 自从
740 | 自个儿
741 | 自各儿
742 | 自己
743 | 自家
744 | 自身
745 | 综上所述
746 | 总的来看
747 | 总的来说
748 | 总的说来
749 | 总而言之
750 | 总之
751 | 纵
752 | 纵令
753 | 纵然
754 | 纵使
755 | 遵照
756 | 作为
757 | 兮
758 | 呃
759 | 呗
760 | 咚
761 | 咦
762 | 喏
763 | 啐
764 | 喔唷
765 | 嗬
766 | 嗯
767 | 嗳
768 |
--------------------------------------------------------------------------------
/gensim/data/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/8/6 10:18
3 | # @Author : xiaolu
4 | # @FileName: test.py
5 | # @Software: PyCharm
6 | import linecache
7 |
8 |
9 | path = 'answer.txt'
10 | # linecache line numbers are 1-based: getline(path, 0) just returns ''
11 | for i in range(1, 6):
12 |     answer = linecache.getline(path, i)
13 |     answer = answer.strip()
14 |     print(answer)
--------------------------------------------------------------------------------
/gradio学习/01-row_column_layout.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 01-row_column_layout.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2023-05-12
6 | """
7 | import gradio as gr
8 |
9 | title = "抽取式问答"
10 |
11 | description = "输入上下文与问题后,点击submit按钮,可从上下文中抽取出答案,赶快试试吧!"
12 |
13 | examples = [
14 | ["普希金从那里学习人民的语言,吸取了许多有益的养料,这一切对普希金后来的创作产生了很大的影响。这两年里,普希金创作了不少优秀的作品,如《囚徒》、《致大海》、《致凯恩》和《假如生活欺骗了你》等几十首抒情诗,叙事诗《努林伯爵》,历史剧《鲍里斯·戈都诺夫》,以及《叶甫盖尼·奥涅金》前六章。", "著名诗歌《假如生活欺骗了你》的作者是"],
15 | ["普希金从那里学习人民的语言,吸取了许多有益的养料,这一切对普希金后来的创作产生了很大的影响。这两年里,普希金创作了不少优秀的作品,如《囚徒》、《致大海》、《致凯恩》和《假如生活欺骗了你》等几十首抒情诗,叙事诗《努林伯爵》,历史剧《鲍里斯·戈都诺夫》,以及《叶甫盖尼·奥涅金》前六章。", "普希金创作的叙事诗叫什么"]
16 | ]
17 |
18 | article = "感兴趣的小伙伴可以阅读[Transformers实用指南](https://zhuanlan.zhihu.com/p/548336726)"
19 |
20 |
21 | # 预测函数
22 | def custom_predict(context, question):
23 | answer = '对不起 我就是不给你回答'
24 | answer = question + ": " + answer
25 | score = 0.01
26 | return answer, score
27 |
28 |
29 | # 清除输入输出
30 | def clear_input():
31 | return "", "", "", ""
32 |
33 |
34 | # 构建Blocks上下文
35 | with gr.Blocks() as demo:
36 |     gr.Markdown(f"# {title}")
37 |     gr.Markdown(description)
38 | with gr.Column(): # 列排列
39 | context = gr.Textbox(label="context")
40 | question = gr.Textbox(label="question")
41 | with gr.Row(): # 行排列
42 | clear = gr.Button("clear") # 清除按钮
43 | submit = gr.Button("submit") # submit提交按钮
44 | with gr.Column(): # 列排列
45 | answer = gr.Textbox(label="answer")
46 | score = gr.Label(label="score")
47 |
48 | # 绑定submit点击函数
49 | submit.click(fn=custom_predict, inputs=[context, question], outputs=[answer, score])
50 |
51 | # 绑定clear点击函数
52 | clear.click(fn=clear_input, inputs=[], outputs=[context, question, answer, score])
53 | gr.Examples(examples, inputs=[context, question])
54 |     gr.Markdown(article)
55 |
56 | demo.launch()
57 |
--------------------------------------------------------------------------------
/gradio学习/02-chatglm_web.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 02-chatglm_web.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2023-05-12
6 | """
7 | from transformers import AutoModel, AutoTokenizer
8 | import gradio as gr
9 | import mdtex2html  # used by postprocess below; missing from the original imports
10 | # 加载模型
11 | tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
12 | model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
13 | model = model.eval()
14 |
15 |
16 | """Override Chatbot.postprocess"""
17 | def postprocess(self, y):
18 | if y is None:
19 | return []
20 | for i, (message, response) in enumerate(y):
21 | y[i] = (
22 | None if message is None else mdtex2html.convert((message)),
23 | None if response is None else mdtex2html.convert(response),
24 | )
25 | return y
26 |
27 |
28 | gr.Chatbot.postprocess = postprocess
29 |
30 |
31 | def parse_text(text):
32 | """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
33 | lines = text.split("\n")
34 | lines = [line for line in lines if line != ""]
35 | count = 0
36 | for i, line in enumerate(lines):
37 | if "```" in line:
38 | count += 1
39 | items = line.split('`')
40 | if count % 2 == 1:
41 |                 lines[i] = f'<pre><code class="language-{items[-1]}">'
42 |             else:
43 |                 lines[i] = f'<br></code></pre>'
44 |         else:
45 |             if i > 0:
46 |                 if count % 2 == 1:
47 |                     line = line.replace("`", "\`")
48 |                     line = line.replace("<", "&lt;")
49 |                     line = line.replace(">", "&gt;")
50 |                     line = line.replace(" ", "&nbsp;")
51 |                     line = line.replace("*", "&ast;")
52 |                     line = line.replace("_", "&lowbar;")
53 |                     line = line.replace("-", "&#45;")
54 |                     line = line.replace(".", "&#46;")
55 |                     line = line.replace("!", "&#33;")
56 |                     line = line.replace("(", "&#40;")
57 |                     line = line.replace(")", "&#41;")
58 |                     line = line.replace("$", "&#36;")
59 |                 lines[i] = "<br>" + line
60 | text = "".join(lines)
61 | return text
62 |
63 |
64 | def predict(input, chatbot, max_length, top_p, temperature, history):
65 | chatbot.append((parse_text(input), ""))
66 | for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
67 | temperature=temperature):
68 | chatbot[-1] = (parse_text(input), parse_text(response))
69 | yield chatbot, history
70 |
71 |
72 | def reset_user_input():
73 | return gr.update(value='')
74 |
75 |
76 | def reset_state():
77 | return [], []
78 |
79 |
80 | with gr.Blocks() as demo:
81 |     gr.HTML("""<h1 align="center">ChatGLM</h1>""")  # arbitrary front-end markup can be embedded here
82 |
83 | chatbot = gr.Chatbot() # 占一行 chatbot
84 | with gr.Row(): # 下面的每个元素行行排列
85 | with gr.Column(scale=4): # 行 左 占总行空间的4/5
86 | with gr.Column(scale=12):
87 | user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10).style(
88 | container=False)
89 | with gr.Column(min_width=32, scale=1):
90 | submitBtn = gr.Button("Submit", variant="primary")
91 |
92 | with gr.Column(scale=1): # 行 右 占总行空间的1/5
93 | emptyBtn = gr.Button("Clear History")
94 | max_length = gr.Slider(0, 4096, value=2048, step=1.0, label="Maximum length", interactive=True)
95 | top_p = gr.Slider(0, 1, value=0.7, step=0.01, label="Top P", interactive=True)
96 | temperature = gr.Slider(0, 1, value=0.95, step=0.01, label="Temperature", interactive=True)
97 |
98 | history = gr.State([])
99 | submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, history], [chatbot, history],
100 | show_progress=True)
101 | submitBtn.click(reset_user_input, [], [user_input]) # 点了提交按钮后 用户输入框也得改下
102 | emptyBtn.click(reset_state, outputs=[chatbot, history], show_progress=True)
103 |
104 | demo.queue().launch(share=False, inbrowser=True)
105 | # 如果想加登录
106 | # zhanghu = [["xiaolu", "1234"]]
107 | # demo.queue().launch(share=True, server_name='0.0.0.0', server_port=6006, auth=zhanghu, auth_message='请联系xiaolu认证进行访问')
108 |
--------------------------------------------------------------------------------
/ipdb调试python程序/001-简单调试.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/8/6 10:04
3 | # @Author : xiaolu
4 | # @FileName: 001-简单调试.py
5 | # @Software: PyCharm
6 | from ipdb import set_trace
7 |
8 |
9 | if __name__ == "__main__":
10 | a = 0
11 | b = 1
12 | for i in range(1, 100, 2):
13 | a += i
14 | b *= i
15 | set_trace()
16 |
17 |
18 | # ipdb> print(a)
19 | # 1
20 | # ipdb> print(b)
21 | # 1
22 | # 接下来输入n 每输入一次 往后执行一行
23 |
24 | # 假设输入两次n 此时的a=4 b=1 输三次n 此时的a=4 b=3
25 |
--------------------------------------------------------------------------------
/ipdb调试python程序/readme.txt:
--------------------------------------------------------------------------------
1 | ipdb command cheat sheet:
2 |
3 | ENTER (repeat the last command)
4 | c (continue execution)
5 | l (show where execution currently is)
6 | s (step into a subroutine)
7 | r (run until the current subroutine returns)
8 | ! (execute a python statement directly, e.g. !a = 10)
9 | h (help)
10 | a(rgs)   print the arguments of the current function
11 | j(ump)   jump execution to a given line number
12 | l(ist)   list the block of code about to run
13 | n(ext)   run the next line; if it contains a function call, n does NOT step into it
14 | p(rint)  one of the most useful commands: print a variable
15 | q(uit)   quit the debugger
16 | r(eturn) continue until the current function returns
17 | s(tep)   like n, but steps into any function called on the current line
--------------------------------------------------------------------------------
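
Two related entry points worth knowing (standard ipdb/pdb behaviour): running a whole script under the debugger, and post-mortem inspection after an exception:

# python -m ipdb 001-简单调试.py                   run the whole script under ipdb
# PYTHONBREAKPOINT=ipdb.set_trace python xxx.py   route built-in breakpoint() to ipdb (Python 3.7+)
import ipdb

try:
    1 / 0
except ZeroDivisionError:
    ipdb.post_mortem()  # open the debugger at the frame where the exception was raised
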
/logging模块的使用/001-日志级别的使用.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-日志级别的使用.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-21
6 | """
7 | import logging
8 |
9 | if __name__ == '__main__':
10 |     logging.basicConfig(level=logging.NOTSET)  # NOTSET means no filtering: all five messages below are emitted (without basicConfig, the root default is WARNING)
11 | logging.debug('数学')
12 | logging.info('英语')
13 | logging.warning('物理')
14 | logging.error('体育')
15 | logging.critical('政治')
--------------------------------------------------------------------------------
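
For contrast, a sketch of the same five calls with the threshold raised to WARNING; only the last three survive the filter:

import logging

logging.basicConfig(level=logging.WARNING)
logging.debug('数学')     # filtered out
logging.info('英语')      # filtered out
logging.warning('物理')   # emitted
logging.error('体育')     # emitted
logging.critical('政治')  # emitted
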
/logging模块的使用/002-日志控制台输出.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-日志控制台输出.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-21
6 | """
7 | import logging # 引入logging模块
8 | if __name__ == '__main__':
9 |     logging.basicConfig(level=logging.DEBUG,  # lowest level that will be emitted
10 |                         format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
11 |     # asctime = timestamp, filename = source file, lineno = line number, levelname = severity, message = the log text
12 |     # with the level set to DEBUG, all five messages below appear on the console
13 |     logging.info('this is a logging info message')
14 |     logging.debug('this is a logging debug message')
15 |     logging.warning('this is a logging warning message')
16 |     logging.error('this is a logging error message')
17 |     logging.critical('this is a logging critical message')
18 |
--------------------------------------------------------------------------------
/logging模块的使用/003-日志文件输出.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-日志文件输出.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-21
6 | """
7 | import logging # 引入logging模块
8 | import os.path
9 | import time
10 |
11 | if __name__ == '__main__':
12 | # 第一步,创建一个logger
13 | logger = logging.getLogger()
14 | logger.setLevel(logging.INFO) # Log等级总开关
15 |
16 | # 第二步,创建一个handler,用于写入日志文件
17 | rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
18 | log_path = os.path.dirname(os.getcwd()) + '/Logs/'
19 | os.makedirs(log_path, exist_ok=True) # 创建文件夹
20 | log_name = log_path + rq + '.log' # 日志名
21 | logfile = log_name
22 | fh = logging.FileHandler(logfile, mode='w')
23 |     fh.setLevel(logging.DEBUG)  # per-handler threshold; the logger itself is INFO, so DEBUG records never reach this handler anyway
24 |
25 | # 第三步,定义handler的输出格式
26 | formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
27 | fh.setFormatter(formatter)
28 |
29 | # 第四步,将logger添加到handler里面
30 | logger.addHandler(fh)
31 |
32 | # 日志
33 | logger.debug('this is a logger debug message')
34 | logger.info('this is a logger info message')
35 | logger.warning('this is a logger warning message')
36 | logger.error('this is a logger error message')
37 | logger.critical('this is a logger critical message')
--------------------------------------------------------------------------------
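
The FileHandler recipe above extends naturally to multiple destinations. A minimal sketch (hypothetical 'run.log' path) that attaches a console handler alongside the file handler:

import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")

fh = logging.FileHandler('run.log', mode='w')  # hypothetical log path
fh.setFormatter(formatter)
logger.addHandler(fh)

ch = logging.StreamHandler()  # console output
ch.setFormatter(formatter)
logger.addHandler(ch)

logger.info('this record goes to both run.log and the console')
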
/logging模块的使用/004-捕捉异常.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 004-捕捉异常.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-21
6 | """
7 | import os.path
8 | import time
9 | import logging
10 |
11 | if __name__ == '__main__':
12 | # 创建一个logger
13 | logger = logging.getLogger()
14 | logger.setLevel(logging.INFO) # Log等级总开关
15 |
16 | # 创建一个handler,用于写入日志文件
17 | rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
18 | log_path = os.path.dirname(os.getcwd()) + '/Logs/'
19 | os.makedirs(log_path, exist_ok=True)
20 | log_name = log_path + rq + '.log'
21 | logfile = log_name
22 | fh = logging.FileHandler(logfile, mode='w')
23 |     fh.setLevel(logging.DEBUG)  # per-handler threshold; the logger itself is INFO, so DEBUG records never reach this handler anyway
24 |
25 | # 定义handler的输出格式
26 | formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
27 | fh.setFormatter(formatter)
28 | logger.addHandler(fh)
29 |
30 | # 使用logger.XX来记录错误,这里的"error"可以根据所需要的级别进行修改
31 | try:
32 | open('/path/to/does/not/exist', 'rb')
33 | except (SystemExit, KeyboardInterrupt):
34 | raise
35 | except Exception:
36 | logger.error('Failed to open file', exc_info=True)
37 |
--------------------------------------------------------------------------------
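
Inside an except block, logger.exception(...) is shorthand for logger.error(..., exc_info=True); a sketch reusing the logger configured above:

try:
    open('/path/to/does/not/exist', 'rb')
except Exception:
    logger.exception('Failed to open file')  # logs at ERROR level and appends the traceback
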
/pandas一键画图/001-plot_zhexiantu.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-plot_zhexiantu.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-12-29
6 | """
7 | # 安装pandas以及pandas_bokeh # pip install pandas_bokeh pandas
8 | import numpy as np
9 | import pandas as pd
10 | import pandas_bokeh
11 |
12 | # 注意 文件名字不要夹带中文
13 |
14 | if __name__ == '__main__':
15 | np.random.seed(55)
16 | df = pd.DataFrame({"宁德时代": np.random.randn(100)+0.2,
17 | "贵州茅台": np.random.randn(100)+0.17},
18 | index=pd.date_range('1/1/2021', periods=100))
19 | df = df.cumsum() # 累加
20 | df = df + 50
21 | df.plot_bokeh.line(
22 | figsize=(800, 450), # 图片的大小
23 | title="宁德时代 vs 贵州茅台", # 表名
24 | xlabel="日期", # 横坐标的名字
25 | ylabel="股票价格 [$]", # 纵坐标的名字
26 | # yticks=[0, 100, 200, 300, 400], # y轴的虚线 可以不带
27 | ylim=(45, 80), # y轴范围
28 | xlim=("2021-01-01", "2021-04-01"), # x轴的范围
29 | colormap=["red", "blue"],
30 | plot_data_points=True, # 标记每个值
31 | plot_data_points_size=5,
32 | marker="asterisk")
33 |
--------------------------------------------------------------------------------
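
pandas-bokeh can also be registered as the global pandas plotting backend, so plain df.plot(...) calls produce the same interactive output. A sketch, assuming pandas>=0.25 and the df built in the script above (output file name is hypothetical):

import pandas as pd
import pandas_bokeh

pd.set_option('plotting.backend', 'pandas_bokeh')    # route df.plot(...) through pandas-bokeh
pandas_bokeh.output_file('zhexiantu_backend.html')   # write the plot to an html file

df.plot(kind='line', title='宁德时代 vs 贵州茅台')
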
/pandas一键画图/002-plot_sandiantu.html:
--------------------------------------------------------------------------------
(generated Bokeh plot page: the inline bokeh scripts and plot JSON were stripped in this dump; only the "Bokeh Plot" page title survives)
--------------------------------------------------------------------------------
/pandas一键画图/002-plot_sandiantu.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-plot_sandiantu.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-12-29
6 | """
7 | import pandas as pd
8 | import pandas_bokeh
9 |
10 | if __name__ == '__main__':
11 | # 随便造一些数据
12 | df = pd.DataFrame({
13 | 'length': [5.1, 4.9, 4.7, 4.6, 5., 5.4, 4.6, 5., 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7],
14 | 'width': [3.5, 3., 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3., 3., 4., 4.4],
15 | 'label': [0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1]
16 | })
17 |
18 | p_scatter = df.plot_bokeh.scatter(
19 | x="length",
20 | y="width",
21 |         category="label",  # optional: color the points by the 'label' column
22 | title="随便一画",
23 | show_figure=True,
24 | )
--------------------------------------------------------------------------------
/pandas一键画图/003-plot_zhuzhuangtu.html:
--------------------------------------------------------------------------------
(generated Bokeh plot page: the inline bokeh scripts and plot JSON were stripped in this dump; only the "Bokeh Plot" page title survives)
--------------------------------------------------------------------------------
/pandas一键画图/003-plot_zhuzhuangtu.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-plot_zhuzhuangtu.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-12-29
6 | """
7 | import pandas as pd
8 | import pandas_bokeh
9 |
10 |
11 | if __name__ == '__main__':
12 | data = {
13 | 'fruits':
14 | ['苹果', '梨', '草莓', '西瓜', '葡萄', '香蕉'],
15 | '2015': [2, 1, 4, 3, 2, 4],
16 | '2016': [5, 3, 3, 2, 4, 6],
17 | '2017': [3, 2, 4, 4, 5, 3]
18 | }
19 | df = pd.DataFrame(data).set_index("fruits") # 设置水果为索引
20 |
21 | p_bar = df.plot_bokeh.bar(
22 |         ylabel="每斤的价格 [¥]",
23 | title="水果每年的价格",
24 | alpha=0.6)
--------------------------------------------------------------------------------
/py2neo操作neo4j/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/.DS_Store
--------------------------------------------------------------------------------
/py2neo操作neo4j/py2neo简单练习/create_graph_v1.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : create_graph_v1.py
4 | # @Time : 2020/11/23 6:52 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | from py2neo import Graph, Node, Relationship, NodeMatcher
10 | import pandas as pd
11 | from pdb import set_trace
12 |
13 |
14 | def load_data():
15 | # 加载数据
16 | # data = pd.read_excel('./santi.xlsx')
17 | # data = pd.read_excel('./mingchaonaxieshier.xlsx')
18 | data = pd.read_excel('./test.xlsx')
19 | start = data['S'].tolist()
20 | relation = data['P'].tolist()
21 | end = data['O'].tolist()
22 | start_list = [str(i) for i in start]
23 | relation_list = [str(i) for i in relation]
24 | end_list = [str(i) for i in end]
25 | link_dict = dict()
26 | link_dict['start'] = start_list
27 | link_dict['relation'] = relation_list
28 | link_dict['end'] = end_list
29 | df_data = pd.DataFrame(link_dict)
30 | return df_data
31 |
32 |
33 | class DataToNeo4j:
34 | def __init__(self):
35 | link = Graph()
36 | self.graph = link
37 |
38 | self.start = 'start'
39 | self.end = 'end'
40 |
41 | self.graph.delete_all() # 将之前的图 全部删除
42 | self.matcher = NodeMatcher(link) # 为了查找
43 |
44 | def create_node(self, start, end):
45 | # 创建节点
46 | for name in start:
47 | node = Node(self.start, name=name)
48 | self.graph.create(node)
49 |
50 | for name in end:
51 | node = Node(self.end, name=name)
52 | self.graph.create(node)
53 |
54 | def create_relation(self, df_data):
55 | m = 0
56 | for m in range(0, len(df_data)):
57 | # print(list(self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'")))
58 | # 相当于在'start'标签下找 name=某个名字的节点
59 | # print(list(self.matcher.match(self.end).where('_.name=' + "'" + df_data['end'][m] + "'")))
60 | # 相当于在'end'标签下找 name=某个名字的节点'
61 | # 然后为这两个节点创建关系
62 | try:
63 | rel = Relationship(
64 | self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'").first(),
65 | df_data['relation'][m],
66 | self.matcher.match(self.end).where('_.name=' + "'" + df_data['end'][m] + "'").first()
67 | )
68 | self.graph.create(rel)
69 | except AttributeError as e:
70 | print(e, m)
71 |
72 |
73 | def data_extraction(df_data):
74 | node_start = []
75 | for i in df_data['start'].tolist():
76 | node_start.append(i)
77 |
78 | node_end = []
79 | for i in df_data['end'].tolist():
80 | node_end.append(i)
81 |
82 | # 去重
83 | node_start = list(set(node_start))
84 | node_end = list(set(node_end))
85 | return node_start, node_end
86 |
87 |
88 | if __name__ == '__main__':
89 | df_data = load_data()
90 | # print(df_data.head())
91 | node_start, node_end = data_extraction(df_data)
92 | # 创建图
93 | create_data = DataToNeo4j()
94 | # 节点
95 | create_data.create_node(node_start, node_end)
96 | # 关系
97 | create_data.create_relation(df_data)
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
--------------------------------------------------------------------------------
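
NodeMatcher can filter on properties directly, which avoids hand-building the where-string (and its quoting pitfalls) as create_relation does above. A sketch with a hypothetical node name:

from py2neo import Graph, NodeMatcher

graph = Graph()
matcher = NodeMatcher(graph)

# equivalent to .where("_.name='叶文洁'") but safe against quotes in the name
node = matcher.match('start', name='叶文洁').first()  # '叶文洁' is a hypothetical value
print(node)
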
/py2neo操作neo4j/py2neo简单练习/create_graph_v2.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : create_graph_v2.py
4 | # @Time : 2020/11/23 9:54 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
17 | from py2neo import Graph, Node, Relationship, NodeMatcher
18 | import pandas as pd
19 | from pdb import set_trace
20 |
21 |
22 | def load_data():
23 | # 加载数据
24 | data = pd.read_excel('./santi.xlsx')
25 | # data = pd.read_excel('./mingchaonaxieshier.xlsx')
26 | # data = pd.read_excel('./test.xlsx')
27 | start = data['S'].tolist()
28 | relation = data['P'].tolist()
29 | end = data['O'].tolist()
30 | start_list = [str(i) for i in start]
31 | relation_list = [str(i) for i in relation]
32 | end_list = [str(i) for i in end]
33 | link_dict = dict()
34 | link_dict['start'] = start_list
35 | link_dict['relation'] = relation_list
36 | link_dict['end'] = end_list
37 | df_data = pd.DataFrame(link_dict)
38 | return df_data
39 |
40 |
41 | class DataToNeo4j:
42 | def __init__(self):
43 | link = Graph()
44 | self.graph = link
45 |
46 | self.start = 'start'
47 | self.end = 'end'
48 |
49 | self.graph.delete_all() # 将之前的图 全部删除
50 | self.matcher = NodeMatcher(link) # 为了查找
51 |
52 | def create_node(self, start, end):
53 | # 创建节点
54 | temp = []
55 | temp.extend(start)
56 | temp.extend(end)
57 | temp = list(set(temp))
58 | for t in temp:
59 | node = Node(self.start, name=t)
60 | self.graph.create(node)
61 |
62 |
63 | # for name in start:
64 | # node = Node(self.start, name=name)
65 | # self.graph.create(node)
66 | #
67 | # for name in end:
68 | # node = Node(self.end, name=name)
69 | # self.graph.create(node)
70 |
71 | def create_relation(self, df_data):
72 | m = 0
73 | for m in range(0, len(df_data)):
74 | # print(list(self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'")))
75 | # 相当于在'start'标签下找 name=某个名字的节点
76 | # print(list(self.matcher.match(self.end).where('_.name=' + "'" + df_data['end'][m] + "'")))
77 | # 相当于在'end'标签下找 name=某个名字的节点'
78 | # 然后为这两个节点创建关系
79 | try:
80 | rel = Relationship(
81 | self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'").first(),
82 | df_data['relation'][m],
83 | self.matcher.match(self.start).where('_.name=' + "'" + df_data['end'][m] + "'").first()
84 | )
85 | self.graph.create(rel)
86 | except AttributeError as e:
87 | print(e, m)
88 |
89 |
90 | def data_extraction(df_data):
91 | node_start = []
92 | for i in df_data['start'].tolist():
93 | node_start.append(i)
94 |
95 | node_end = []
96 | for i in df_data['end'].tolist():
97 | node_end.append(i)
98 |
99 | # 去重
100 | node_start = list(set(node_start))
101 | node_end = list(set(node_end))
102 | return node_start, node_end
103 |
104 |
105 | if __name__ == '__main__':
106 | df_data = load_data()
107 | # print(df_data.head())
108 | node_start, node_end = data_extraction(df_data)
109 | # 创建图
110 | create_data = DataToNeo4j()
111 | # 节点
112 | create_data.create_node(node_start, node_end)
113 | # 关系
114 | create_data.create_relation(df_data)
--------------------------------------------------------------------------------
/py2neo操作neo4j/py2neo简单练习/mingchaonaxieshier.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/py2neo简单练习/mingchaonaxieshier.xlsx
--------------------------------------------------------------------------------
/py2neo操作neo4j/py2neo简单练习/santi.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/py2neo简单练习/santi.xlsx
--------------------------------------------------------------------------------
/py2neo操作neo4j/py2neo简单练习/test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/py2neo简单练习/test.xlsx
--------------------------------------------------------------------------------
/py2neo操作neo4j/readme.txt:
--------------------------------------------------------------------------------
1 | 1. Create a node
2 |    create (n:Person {name:"我", age:21})
3 | 2. Create a relationship
4 |    create (p:Person {name:"我", age:"23"})-[:包工程{金额:10000}]->(n:Person {name:"好大哥", age:"35"})
5 | 3. Delete a node. Note: a node with relationships cannot be deleted until the relationships are removed first
6 |    create (n:Person {name:"XL", age:23})
7 |    match (n:Person {name:"XL"}) delete n
8 | 4. Delete a relationship
9 |    match (p:Person {name:"我", age:"23"})-[f:包工程{金额:10000}]->(n:Person {name:"好大哥", age:"35"}) delete f
10 | 5. Attach a label
11 |    match (t:Person) where id(t)=2 set t:好人 return t
12 |    i.e. find a node by its id, then attach the label 好人 to it
13 | 6. Add an extra property
14 |    match (a:好人) where id(a)=2 set a.战斗力=200 return a
15 |    find the node with id 2 under the 好人 label and set its 战斗力 property to 200
16 | 7. Query
17 |    create (:Person {name:"唐僧", age:"79"})-[:师傅 {s_time:"2020-11-23"}]->(:Person {name:"孙悟空", age:"1w"})
18 |    match (a:Person)-[:师傅]->(b:Person) return a,b
19 |    create a 师傅 relationship between 唐僧 and 孙悟空, then find all node pairs linked by 师傅
20 | 8. Quickly clear the whole database
21 |    match (n) detach delete n
--------------------------------------------------------------------------------
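
The same operations through py2neo, for comparison (a sketch; Graph() needs your own connection details):

from py2neo import Graph, Node, Relationship

graph = Graph()  # fill in your own uri / auth

me = Node('Person', name='我', age=21)
boss = Node('Person', name='好大哥', age=35)
deal = Relationship(me, '包工程', boss, 金额=10000)
graph.create(deal)  # creates both endpoint nodes and the relationship in one shot

graph.run('match (n) detach delete n')  # item 8: wipe the whole database
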
/py2neo操作neo4j/事件三元组抽取/ltp的使用.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : ltp的使用.py
4 | # @Time : 2020/11/25 9:20 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | from ltp import LTP
10 | # 安装ltp: pip install ltp -i https://pypi.douban.com/simple/
11 | # 学习文档: http://ltp.ai/docs/quickstart.html
12 |
13 |
14 | def fenju():
15 | # 分句子
16 | sents = ltp.sent_split(["他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"])
17 | print(sents)
18 |
19 |
20 | def fenci():
21 | # 可以加载自己的词表
22 | ltp.init_dict(path='my_vocab.txt', max_window=4)
23 | segment, _ = ltp.seg(['我是你爸,我是你妈'])
24 | print(segment)
25 |
26 |
27 | def cixingbiaozhu():
28 | seg, hidden = ltp.seg(['他叫汤姆去拿外衣。'])
29 | pos = ltp.pos(hidden)
30 | print(seg)
31 | print(pos)
32 |
33 |
34 | def mingmingshitishibie():
35 | seg, hidden = ltp.seg(['他叫汤姆去拿外衣。孙悟空不同意咋办? 但是奥特曼肯定会同意'])
36 | ner = ltp.ner(hidden)
37 | print(seg)
38 | print(ner)
39 |
40 | for i in ner[0]:
41 | tag = i[0]
42 | name = seg[0][i[1]: i[2]+1]
43 | print(tag, ":", name)
44 |
45 |
46 | def yuyijuesebiaozhu():
47 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
48 | srl = ltp.srl(hidden)
49 | print(srl) # 包含了空
50 |
51 | srl = ltp.srl(hidden, keep_empty=False)
52 | print(srl)
53 |
54 |
55 | def yicunjufafenxi():
56 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
57 | dep = ltp.dep(hidden)
58 | print(dep)
59 |
60 |
61 | def yicunjufashu():
62 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
63 | sdp = ltp.sdp(hidden, graph=False)
64 | print(sdp)
65 |
66 |
67 | def yicunjufafenxitu():
68 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
69 | sdp = ltp.sdp(hidden, graph=True)
70 | print(sdp)
71 |
72 |
73 |
74 |
75 |
76 | if __name__ == '__main__':
77 | ltp = LTP() # ltp = LTP(path = "base|small|tiny") 默认下载small
78 |
79 | # 1. 分句
80 | # fenju()
81 |
82 | # 2. 分词
83 | # fenci()
84 |
85 | # 3. 词性标注
86 | # cixingbiaozhu()
87 |
88 | # 4. 命名实体识别
89 | # mingmingshitishibie()
90 |
91 | # 5. 语义角色标注
92 | # yuyijuesebiaozhu()
93 |
94 | # 6. 依存句法分析
95 | # yicunjufafenxi()
96 |
97 | # 7. 依存句法树
98 | yicunjufashu()
99 |
100 | # 8. 依存句法分析(图)
101 | yicunjufafenxitu()
102 |
--------------------------------------------------------------------------------
/py2neo操作neo4j/事件三元组抽取/my_vocab.txt:
--------------------------------------------------------------------------------
1 | 我是
2 | 你妈
--------------------------------------------------------------------------------
/py2neo操作neo4j/事件三元组抽取/readme.txt:
--------------------------------------------------------------------------------
1 | --------------2020-12-9 更新----------------------
2 | 这里不建议用ltp做三元组抽取,最近学习了一个深度学习模型进行三元组抽取 在我的另一个仓库
3 |
4 | [链接](https://github.com/shawroad/NLP_pytorch_project/tree/master/relation_extraction/lstm_cnn_information_extract)
5 |
6 |
7 | --------------2020-11-28 更新----------------------
8 | 迪哥使用的是pyltp。 这里我不推荐用pyltp,这个包目前已经不更新了。已经是老古董了。加载的模型估计也过时了。
9 |
10 | 这里我推荐使用ltp
11 |
12 | 安装: pip install ltp -i https://pypi.douban.com/simple/
13 |
14 | 测试安装成功与否: from ltp import LTP
15 |
16 | 安装成功后 下载模型 直接执行下面的代码 就可以下载了
17 | from ltp import LTP
18 | ltp = LTP() # ltp = LTP(path = "base|small|tiny") 可以指定参数 默认下载的是small 180m左右
19 |
20 |
21 | 这些操作完成以后 建议先看看ltp的使用方法
22 | 可以看代码 ltp的使用.py 或者看官方文档:http://ltp.ai/docs/quickstart.html
23 | 然后在去看三元组的抽取
24 |
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/.DS_Store
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/ahocorasick的使用/demo.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : demo.py
4 | # @Time : 2020/11/25 10:26 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | # 安装 pip install ahocorasick -i https://pypi.douban.com/simple
10 | import ahocorasick
11 |
12 |
13 | if __name__ == '__main__':
14 | wordlist = ['长春海外制药接骨续筋片', '香菇炖甲鱼', '三鹤药业黄柏胶囊', '上海衡山熊去氧胆酸片', '升和药业依托泊苷注射液', '怡诺思',
15 | '人格障碍', '转铁蛋白饱和度', '脾囊肿', '素烧白萝卜', '利君现代冠脉宁片', '上海复华药业注射用还原型谷', '阴囊上有白色小疙瘩',
16 | '腹痛伴休克', '成都通德胰激肽原酶肠溶片', '蒸猪肝', '河北百善血尿胶囊', '精神障碍', '输卵管畸形', '元和抑眩宁胶囊', '莲藕豆腐',
17 | '辰欣哈西奈德溶液', '信谊烟酸片', '慢性胆囊炎', '参芪降糖颗粒', '康普药业盐酸普萘洛尔片', '西安迪赛胸腺肽肠溶片',
18 | '双鹭药业注射用复合辅酶', '慢性筛窦炎', '新高制药维胺酯维E乳膏', '冰黄肤乐软膏', '神经类疾病', '液晶热图',
19 | '枣(干)', '股外侧皮神经病', '浙江惠松硅炭银片', '牙根外露', '湖北潜江氯霉素滴眼液', '盐类皮质激素分泌过多', '五子衍宗丸',
20 | '小儿阵发性睡眠性血红蛋白尿症', '功能失调性子宫出血病', '茵栀黄口服液', '眼底出血和渗出', '斯达制药注射用头孢噻肟钠', '复方白芷酊',
21 | '胫腓骨骨折', '西南药业氯霉素片', '宫颈炎', '茶碱缓释胶囊', '原发性硬化性胆管炎', '郑州韩都利肺胶囊', '咽反射消失',
22 | '脊髓灰质炎', '甲状腺片', '回盲瓣功能不全', '牛黄清胃丸', '乙肝e抗体', '马齿苋粥', '动脉硬化', '宝宝乐', '肠闭锁', '肺放线菌病',
23 | '江苏晨牌产妇安颗粒', '犬吠样咳嗽', '胃康灵胶囊', '小儿烟酸缺乏病', '青龙防风通圣丸', '广东南国维生素C片', '碘化油咀嚼片',
24 | '西乐葆', '伟哥甲磺酸酚妥拉明分散片', '成都迪康药业樟脑醑', '斑疹', '五花炖墨鱼', '肉炖芸豆粉条', '陕西东泰制药益脉康胶囊',
25 | '桔梗八味颗粒', '华南牌溴丙胺太林片', '吉林敖东洮南小牛脾提取物注', '仁青芒觉', '血吸虫病与肝胆疾病', '持续性枕横位难产',
26 | '弯曲菌感染', '丝瓜蘑菇肉片汤', '长春银诺克清咽片', '肝叶萎缩', '迪皿盐酸左西替利嗪口服溶液', '阿司匹林']
27 |
28 | # 建树
29 | actree = ahocorasick.Automaton()
30 | for index, word in enumerate(wordlist):
31 | actree.add_word(word, (index, word))
32 | actree.make_automaton()
33 |
34 | for i in actree.iter('昨天发烧,服用了阿司匹林,并且还吃了牛黄清胃丸,饭是吃了瓜烧白菜,大便有点色浅'):
35 | print(i)
36 |
--------------------------------------------------------------------------------
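
actree.iter() yields (end_index, value) pairs, where value is whatever add_word stored; the start of each match can be recovered from the word length. A small self-contained sketch:

import ahocorasick

actree = ahocorasick.Automaton()
for index, word in enumerate(['阿司匹林', '牛黄清胃丸']):
    actree.add_word(word, (index, word))
actree.make_automaton()

sentence = '昨天发烧,服用了阿司匹林,并且还吃了牛黄清胃丸'
for end_index, (idx, word) in actree.iter(sentence):
    start_index = end_index - len(word) + 1
    print(word, '==', sentence[start_index:end_index + 1])
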
/py2neo操作neo4j/医疗知识图谱问答/build_medical_graph.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : build_medical_graph.py
4 | # @Time : 2020/11/24 8:39 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import json
10 | from py2neo import Graph, Node
11 |
12 |
13 | class MedicalGraph:
14 | def __init__(self):
15 | self.data_path = './data/medical_min.json'
16 | self.g = Graph() # 这里填自己的信息
17 | self.g.delete_all() # 将之前的图 全部删除
18 |
19 | def read_nodes(self):
20 | # 共7类节点
21 | drugs = [] # 药品
22 | foods = [] # 食物
23 | checks = [] # 检查
24 | departments = [] # 科室
25 | producers = [] # 药品大类
26 | diseases = [] # 疾病
27 | symptoms = [] # 症状
28 |
29 | disease_infos = [] # 疾病信息
30 |
31 | # 构建节点实体关系
32 | rels_department = [] # 科室-科室关系
33 | rels_noteat = [] # 疾病-忌吃食物关系
34 | rels_doeat = [] # 疾病-宜吃食物关系
35 | rels_recommandeat = [] # 疾病-推荐吃食物关系
36 | rels_commonddrug = [] # 疾病-通用药品关系
37 | rels_recommanddrug = [] # 疾病-热门药品关系
38 | rels_check = [] # 疾病-检查关系
39 | rels_drug_producer = [] # 厂商-药物关系
40 |
41 | rels_symptom = [] # 疾病症状关系
42 | rels_acompany = [] # 疾病并发关系
43 | rels_category = [] # 疾病与科室之间的关系
44 |
45 | count = 0
46 | for data in open(self.data_path, encoding='utf8'):
47 | disease_dict = {}
48 | count += 1
49 | print(count)
50 | data_json = json.loads(data)
51 | disease = data_json['name'] # 疾病名
52 | disease_dict['name'] = disease
53 | diseases.append(disease)
54 | disease_dict['desc'] = ''
55 | disease_dict['prevent'] = ''
56 | disease_dict['cause'] = ''
57 | disease_dict['easy_get'] = ''
58 | disease_dict['cure_department'] = ''
59 | disease_dict['cure_way'] = ''
60 | disease_dict['cure_lasttime'] = ''
61 | disease_dict['symptom'] = ''
62 | disease_dict['cured_prob'] = ''
63 |
64 | # 做症状 然后做疾病和症状的关系
65 | if 'symptom' in data_json:
66 | symptoms += data_json['symptom'] # 这里加入所有的症状
67 | for symptom in data_json['symptom']:
68 | rels_symptom.append([disease, symptom])
69 |
70 | # 做并发症 并做疾病与并发症的关系
71 | if 'acompany' in data_json:
72 | for acompany in data_json['acompany']:
73 | rels_acompany.append([disease, acompany])
74 |
75 | # 做描述 不和病做关系 当做病的属性
76 | if 'desc' in data_json:
77 | disease_dict['desc'] = data_json['desc']
78 |
79 | #
80 | if 'prevent' in data_json:
81 | disease_dict['prevent'] = data_json['prevent']
82 |
83 | if 'cause' in data_json:
84 | disease_dict['cause'] = data_json['cause']
85 |
86 | if 'get_prob' in data_json:
87 | disease_dict['get_prob'] = data_json['get_prob']
88 |
89 | if 'easy_get' in data_json:
90 | disease_dict['easy_get'] = data_json['easy_get']
91 |
92 | # 科室
93 | if 'cure_department' in data_json:
94 | cure_department = data_json['cure_department']
95 | if len(cure_department) == 1:
96 | rels_category.append([disease, cure_department[0]])
97 | if len(cure_department) == 2:
98 | big = cure_department[0]
99 | small = cure_department[1]
100 | rels_department.append([small, big])
101 | rels_category.append([disease, small])
102 | disease_dict['cure_department'] = cure_department
103 | departments += cure_department
104 |
105 | if 'cure_way' in data_json:
106 | disease_dict['cure_way'] = data_json['cure_way']
107 |
108 | if 'cure_lasttime' in data_json:
109 | disease_dict['cure_lasttime'] = data_json['cure_lasttime']
110 |
111 | if 'cured_prob' in data_json:
112 | disease_dict['cured_prob'] = data_json['cured_prob']
113 |
114 | if 'common_drug' in data_json:
115 | common_drug = data_json['common_drug']
116 | for drug in common_drug:
117 | rels_commonddrug.append([disease, drug])
118 | drugs += common_drug
119 |
120 | if 'recommand_drug' in data_json:
121 | recommand_drug = data_json['recommand_drug']
122 | drugs += recommand_drug
123 | for drug in recommand_drug:
124 | rels_recommanddrug.append([disease, drug])
125 |
126 |             if 'not_eat' in data_json:
127 |                 not_eat = data_json['not_eat']
128 |                 for _not in not_eat:
129 |                     rels_noteat.append([disease, _not])
130 |                 foods += not_eat
131 |             # guard each food list separately: a record may have not_eat without do_eat / recommand_eat
132 |             if 'do_eat' in data_json:
133 |                 do_eat = data_json['do_eat']
134 |                 for _do in do_eat:
135 |                     rels_doeat.append([disease, _do])
136 |                 foods += do_eat
137 |             if 'recommand_eat' in data_json:
138 |                 recommand_eat = data_json['recommand_eat']
139 |                 for _recommand in recommand_eat:
140 |                     rels_recommandeat.append([disease, _recommand])
141 |                 foods += recommand_eat
142 |
143 | if 'check' in data_json:
144 | check = data_json['check']
145 | for _check in check:
146 | rels_check.append([disease, _check])
147 | checks += check
148 | if 'drug_detail' in data_json:
149 | drug_detail = data_json['drug_detail']
150 | producer = [i.split('(')[0] for i in drug_detail]
151 | rels_drug_producer += [[i.split('(')[0], i.split('(')[-1].replace(')', '')] for i in drug_detail]
152 | producers += producer
153 | disease_infos.append(disease_dict)
154 | return set(drugs), set(foods), set(checks), set(departments), set(producers), set(symptoms), set(diseases), \
155 | disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, \
156 | rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category
157 |
158 | def create_diseases_nodes(self, disease_infos):
159 | '''
160 | 创建知识图谱中心疾病的节点
161 | '''
162 | count = 0
163 | for disease_dict in disease_infos:
164 | # 疾病节点里面包含几种属性信息
165 | node = Node("Disease", name=disease_dict['name'], desc=disease_dict['desc'],
166 | prevent=disease_dict['prevent'], cause=disease_dict['cause'],
167 | easy_get=disease_dict['easy_get'], cure_lasttime=disease_dict['cure_lasttime'],
168 | cure_department=disease_dict['cure_department']
169 | , cure_way=disease_dict['cure_way'], cured_prob=disease_dict['cured_prob'])
170 | self.g.create(node)
171 | count += 1
172 | print(count)
173 | return
174 |
175 | def create_node(self, label, nodes):
176 | '''
177 | 建立节点
178 | '''
179 | count = 0
180 | for node_name in nodes:
181 | node = Node(label, name=node_name)
182 | self.g.create(node)
183 | count += 1
184 | print(count, len(nodes))
185 | return
186 |
187 | def create_graphnodes(self):
188 | Drugs, Foods, Checks, Departments, Producers, Symptoms, Diseases, disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category = self.read_nodes()
189 | self.create_diseases_nodes(disease_infos)
190 |
191 | self.create_node('Drug', Drugs)
192 | print(len(Drugs))
193 |
194 | self.create_node('Food', Foods)
195 | print(len(Foods))
196 |
197 | self.create_node('Check', Checks)
198 | print(len(Checks))
199 |
200 | self.create_node('Department', Departments)
201 | print(len(Departments))
202 |
203 | self.create_node('Producer', Producers)
204 | print(len(Producers))
205 |
206 | self.create_node('Symptom', Symptoms)
207 |
208 | return
209 |
210 | def create_graphrels(self):
211 | Drugs, Foods, Checks, Departments, Producers, Symptoms, Diseases, disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category = self.read_nodes()
212 | self.create_relationship('Disease', 'Food', rels_recommandeat, 'recommand_eat', '推荐食谱')
213 | self.create_relationship('Disease', 'Food', rels_noteat, 'no_eat', '忌吃')
214 | self.create_relationship('Disease', 'Food', rels_doeat, 'do_eat', '宜吃')
215 | self.create_relationship('Department', 'Department', rels_department, 'belongs_to', '属于')
216 | self.create_relationship('Disease', 'Drug', rels_commonddrug, 'common_drug', '常用药品')
217 | self.create_relationship('Producer', 'Drug', rels_drug_producer, 'drugs_of', '生产药品')
218 | self.create_relationship('Disease', 'Drug', rels_recommanddrug, 'recommand_drug', '好评药品')
219 | self.create_relationship('Disease', 'Check', rels_check, 'need_check', '诊断检查')
220 | self.create_relationship('Disease', 'Symptom', rels_symptom, 'has_symptom', '症状')
221 | self.create_relationship('Disease', 'Disease', rels_acompany, 'acompany_with', '并发症')
222 | self.create_relationship('Disease', 'Department', rels_category, 'belongs_to', '所属科室')
223 |
224 | def create_relationship(self, start_node, end_node, edges, rel_type, rel_name):
225 | '''创建实体关联边'''
226 | count = 0
227 | # 去重处理
228 | set_edges = []
229 |
230 | for edge in edges:
231 | set_edges.append('###'.join(edge))
232 |
233 | all = len(set(set_edges))
234 | for edge in set(set_edges):
235 | edge = edge.split('###')
236 | p = edge[0]
237 | q = edge[1]
238 | query = "match(p:%s),(q:%s) where p.name='%s'and q.name='%s' create (p)-[rel:%s{name:'%s'}]->(q)" % (
239 | start_node, end_node, p, q, rel_type, rel_name)
240 | try:
241 | self.g.run(query)
242 | count += 1
243 | print(rel_type, count, all)
244 | except Exception as e:
245 | print(e)
246 | return
247 |
248 |
249 | if __name__ == '__main__':
250 | # 实例化类图
251 | handler = MedicalGraph()
252 |
253 | # 创建节点
254 | handler.create_graphnodes()
255 | # 创建关系
256 | handler.create_graphrels()
257 |
258 |
259 |
260 |
--------------------------------------------------------------------------------
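
create_relationship builds its Cypher by %-formatting the node names into the query string, which breaks as soon as a name contains a quote. py2neo's run() accepts query parameters for property values (labels and relationship types still cannot be parameterized); a sketch with hypothetical values:

from py2neo import Graph

g = Graph()
start_node, end_node, rel_type, rel_name = 'Disease', 'Food', 'do_eat', '宜吃'
p, q = '百日咳', '鸡蛋'  # hypothetical node names

# labels / rel types are formatted in; the name values travel as safe parameters
query = (
    "match (p:%s), (q:%s) where p.name = $p and q.name = $q "
    "create (p)-[rel:%s {name: $rel_name}]->(q)" % (start_node, end_node, rel_type)
)
g.run(query, p=p, q=q, rel_name=rel_name)
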
/py2neo操作neo4j/医疗知识图谱问答/data_process/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : __init__.py.py
4 | # @Time : 2020/11/25 10:15 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 |
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/answer_search.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/answer_search.cpython-37.pyc
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_classifier.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_classifier.cpython-37.pyc
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_parser.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_parser.cpython-37.pyc
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/data_process/answer_search.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : answer_search.py
4 | # @Time : 2020/11/25 11:08 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | from py2neo import Graph
10 |
11 |
12 | class AnswerSearcher:
13 | def __init__(self):
14 | self.g = Graph()
15 | self.num_limit = 20
16 |
17 | def search_main(self, sqls):
18 | # 执行cypher查询,并返回相应结果
19 | final_answers = []
20 | for sql_ in sqls:
21 | question_type = sql_['question_type']
22 | queries = sql_['sql']
23 | answers = []
24 | for query in queries:
25 | ress = self.g.run(query).data()
26 | answers += ress
27 | final_answer = self.answer_prettify(question_type, answers)
28 | if final_answer:
29 | final_answers.append(final_answer)
30 | return final_answers
31 |
32 | def answer_prettify(self, question_type, answers):
33 |         # 根据对应的question_type,调用相应的回复模板
34 | final_answer = []
35 | if not answers:
36 | return ''
37 | if question_type == 'disease_symptom':
38 | desc = [i['n.name'] for i in answers]
39 | subject = answers[0]['m.name']
40 | final_answer = '{0}的症状包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
41 |
42 | elif question_type == 'symptom_disease':
43 | desc = [i['m.name'] for i in answers]
44 | subject = answers[0]['n.name']
45 |             final_answer = '出现症状{0}可能患的疾病有:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
46 |
47 | elif question_type == 'disease_cause':
48 | desc = [i['m.cause'] for i in answers]
49 | subject = answers[0]['m.name']
50 | final_answer = '{0}可能的成因有:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
51 |
52 | elif question_type == 'disease_prevent':
53 | desc = [i['m.prevent'] for i in answers]
54 | subject = answers[0]['m.name']
55 | final_answer = '{0}的预防措施包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
56 |
57 | elif question_type == 'disease_lasttime':
58 | desc = [i['m.cure_lasttime'] for i in answers]
59 | subject = answers[0]['m.name']
60 | final_answer = '{0}治疗可能持续的周期为:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
61 |
62 | elif question_type == 'disease_cureway':
63 | desc = [';'.join(i['m.cure_way']) for i in answers]
64 | subject = answers[0]['m.name']
65 | final_answer = '{0}可以尝试如下治疗:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
66 |
67 | elif question_type == 'disease_cureprob':
68 | desc = [i['m.cured_prob'] for i in answers]
69 | subject = answers[0]['m.name']
70 | final_answer = '{0}治愈的概率为(仅供参考):{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
71 |
72 | elif question_type == 'disease_easyget':
73 | desc = [i['m.easy_get'] for i in answers]
74 | subject = answers[0]['m.name']
75 |
76 | final_answer = '{0}的易感人群包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
77 |
78 | elif question_type == 'disease_desc':
79 | desc = [i['m.desc'] for i in answers]
80 | subject = answers[0]['m.name']
81 | final_answer = '{0},熟悉一下:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
82 |
83 | elif question_type == 'disease_acompany':
84 | desc1 = [i['n.name'] for i in answers]
85 | desc2 = [i['m.name'] for i in answers]
86 | subject = answers[0]['m.name']
87 | desc = [i for i in desc1 + desc2 if i != subject]
88 |             final_answer = '{0}的并发症包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
89 |
90 | elif question_type == 'disease_not_food':
91 | desc = [i['n.name'] for i in answers]
92 | subject = answers[0]['m.name']
93 | final_answer = '{0}忌食的食物包括有:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
94 |
95 | elif question_type == 'disease_do_food':
96 | do_desc = [i['n.name'] for i in answers if i['r.name'] == '宜吃']
97 | recommand_desc = [i['n.name'] for i in answers if i['r.name'] == '推荐食谱']
98 | subject = answers[0]['m.name']
99 | final_answer = '{0}宜食的食物包括有:{1}\n推荐食谱包括有:{2}'.format(subject, ';'.join(list(set(do_desc))[:self.num_limit]), ';'.join(list(set(recommand_desc))[:self.num_limit]))
100 |
101 | elif question_type == 'food_not_disease':
102 | desc = [i['m.name'] for i in answers]
103 | subject = answers[0]['n.name']
104 | final_answer = '患有{0}的人最好不要吃{1}'.format(';'.join(list(set(desc))[:self.num_limit]), subject)
105 |
106 | elif question_type == 'food_do_disease':
107 | desc = [i['m.name'] for i in answers]
108 | subject = answers[0]['n.name']
109 | final_answer = '患有{0}的人建议多试试{1}'.format(';'.join(list(set(desc))[:self.num_limit]), subject)
110 |
111 | elif question_type == 'disease_drug':
112 | desc = [i['n.name'] for i in answers]
113 | subject = answers[0]['m.name']
114 | final_answer = '{0}通常的使用的药品包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
115 |
116 | elif question_type == 'drug_disease':
117 | desc = [i['m.name'] for i in answers]
118 | subject = answers[0]['n.name']
119 | final_answer = '{0}主治的疾病有{1},可以试试'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
120 |
121 | elif question_type == 'disease_check':
122 | desc = [i['n.name'] for i in answers]
123 | subject = answers[0]['m.name']
124 | final_answer = '{0}通常可以通过以下方式检查出来:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
125 |
126 | elif question_type == 'check_disease':
127 | desc = [i['m.name'] for i in answers]
128 | subject = answers[0]['n.name']
129 | final_answer = '通常可以通过{0}检查出来的疾病有{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit]))
130 |
131 | return final_answer
132 |
133 |
134 | if __name__ == '__main__':
135 | searcher = AnswerSearcher()
--------------------------------------------------------------------------------
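`search_main` expects exactly the structure that `QuestionPaser.parser_main` (below) produces: a list of dicts, each with a `question_type` and a list of Cypher statements under `sql`. A minimal usage sketch, assuming a local Neo4j instance that already holds the graph built above and contains a Disease node named 感冒:

searcher = AnswerSearcher()
sqls = [{'question_type': 'disease_symptom',
         'sql': ["MATCH (m:Disease)-[r:has_symptom]->(n:Symptom) "
                 "where m.name = '感冒' return m.name, r.name, n.name"]}]
print(searcher.search_main(sqls))   # e.g. ['感冒的症状包括:...']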
/py2neo操作neo4j/医疗知识图谱问答/data_process/question_classifier.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : question_classifier.py
4 | # @Time : 2020/11/25 10:16 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import os
10 | import ahocorasick
11 | from pdb import set_trace
12 |
13 |
14 | class QuestionClassifier:
15 | def __init__(self):
16 | # 特征词路径
17 | self.disease_path = './dict/disease.txt'
18 | self.department_path = './dict/department.txt'
19 | self.check_path = './dict/check.txt'
20 | self.drug_path = './dict/drug.txt'
21 | self.food_path = './dict/food.txt'
22 | self.producer_path = './dict/producer.txt'
23 | self.symptom_path = './dict/symptom.txt'
24 | self.deny_path = './dict/deny.txt'
25 |
26 | # 加载特征词
27 |         self.disease_wds = [i.strip() for i in open(self.disease_path, encoding='utf-8') if i.strip()]
28 |         self.department_wds = [i.strip() for i in open(self.department_path, encoding='utf-8') if i.strip()]
29 |         self.check_wds = [i.strip() for i in open(self.check_path, encoding='utf-8') if i.strip()]
30 |         self.drug_wds = [i.strip() for i in open(self.drug_path, encoding='utf-8') if i.strip()]
31 |         self.food_wds = [i.strip() for i in open(self.food_path, encoding='utf-8') if i.strip()]
32 |         self.producer_wds = [i.strip() for i in open(self.producer_path, encoding='utf-8') if i.strip()]
33 |         self.symptom_wds = [i.strip() for i in open(self.symptom_path, encoding='utf-8') if i.strip()]
34 |         self.region_words = set(self.department_wds + self.disease_wds + self.check_wds + self.drug_wds + self.food_wds + self.producer_wds + self.symptom_wds)
35 |         self.deny_words = [i.strip() for i in open(self.deny_path, encoding='utf-8') if i.strip()]
36 |
37 | # 建树 加快检索 可参考ahocorasick的使用 进行学习 actree
38 | self.region_tree = self.build_actree(list(self.region_words))
39 |
40 | # 构建词典
41 | self.wdtype_dict = self.build_wdtype_dict()
42 |
43 | # 问句疑问词
44 | self.symptom_qwds = ['症状', '表征', '现象', '症候', '表现']
45 | self.cause_qwds = ['原因', '成因', '为什么', '怎么会', '怎样才', '咋样才', '怎样会', '如何会', '为啥', '为何', '如何才会', '怎么才会', '会导致', '会造成']
46 | self.acompany_qwds = ['并发症', '并发', '一起发生', '一并发生', '一起出现', '一并出现', '一同发生', '一同出现', '伴随发生', '伴随', '共现']
47 |         self.food_qwds = ['饮食', '饮用', '吃', '食', '伙食', '膳食', '喝', '菜', '忌口', '补品', '保健品', '食谱', '菜谱', '食用', '食物']
48 | self.drug_qwds = ['药', '药品', '用药', '胶囊', '口服液', '炎片']
49 |         self.prevent_qwds = ['预防', '防范', '抵制', '抵御', '防止', '躲避', '逃避', '避开', '免得', '逃开', '避掉', '躲开', '躲掉', '绕开',
50 | '怎样才能不', '怎么才能不', '咋样才能不', '咋才能不', '如何才能不',
51 | '怎样才不', '怎么才不', '咋样才不', '咋才不', '如何才不',
52 | '怎样才可以不', '怎么才可以不', '咋样才可以不', '咋才可以不', '如何可以不',
53 | '怎样才可不', '怎么才可不', '咋样才可不', '咋才可不', '如何可不']
54 | self.lasttime_qwds = ['周期', '多久', '多长时间', '多少时间', '几天', '几年', '多少天', '多少小时', '几个小时', '多少年']
55 |         self.cureway_qwds = ['怎么治疗', '如何医治', '怎么医治', '怎么治', '怎么医', '如何治', '医治方式', '疗法', '咋治', '怎么办', '咋办']
56 | self.cureprob_qwds = ['多大概率能治好', '多大几率能治好', '治好希望大么', '几率', '几成', '比例', '可能性', '能治', '可治', '可以治', '可以医']
57 | self.easyget_qwds = ['易感人群', '容易感染', '易发人群', '什么人', '哪些人', '感染', '染上', '得上']
58 |         self.check_qwds = ['检查', '检查项目', '查出', '测出', '试出']
59 | self.belong_qwds = ['属于什么科', '属于', '什么科', '科室']
60 | self.cure_qwds = ['治疗什么', '治啥', '治疗啥', '医治啥', '治愈啥', '主治啥', '主治什么', '有什么用', '有何用', '用处', '用途',
61 | '有什么好处', '有什么益处', '有何益处', '用来', '用来做啥', '用来作甚', '需要', '要']
62 |
63 | print('model init finished ......')
64 | return
65 |
66 | '''分类主函数'''
67 | def classify(self, question):
68 | data = {}
69 | medical_dict = self.check_medical(question)
70 |
71 | if not medical_dict:
72 | return {}
73 |
74 | data['args'] = medical_dict
75 |
76 | # 收集问句当中所涉及到的实体类型
77 | types = []
78 | for type_ in medical_dict.values():
79 | types += type_
80 |
81 | question_types = []
82 | # 症状
83 | if self.check_words(self.symptom_qwds, question) and ('disease' in types):
84 | question_type = 'disease_symptom'
85 | question_types.append(question_type)
86 |
87 | if self.check_words(self.symptom_qwds, question) and ('symptom' in types):
88 | question_type = 'symptom_disease'
89 | question_types.append(question_type)
90 |
91 | # 原因
92 | if self.check_words(self.cause_qwds, question) and ('disease' in types):
93 | question_type = 'disease_cause'
94 | question_types.append(question_type)
95 | # 并发症
96 | if self.check_words(self.acompany_qwds, question) and ('disease' in types):
97 | question_type = 'disease_acompany'
98 | question_types.append(question_type)
99 |
100 | # 推荐食品
101 | if self.check_words(self.food_qwds, question) and 'disease' in types:
102 | deny_status = self.check_words(self.deny_words, question)
103 | if deny_status:
104 | question_type = 'disease_not_food'
105 | else:
106 | question_type = 'disease_do_food'
107 | question_types.append(question_type)
108 |
109 | # 已知食物找疾病
110 | if self.check_words(self.food_qwds+self.cure_qwds, question) and 'food' in types:
111 | deny_status = self.check_words(self.deny_words, question)
112 | if deny_status:
113 | question_type = 'food_not_disease'
114 | else:
115 | question_type = 'food_do_disease'
116 | question_types.append(question_type)
117 |
118 | # 推荐药品
119 | if self.check_words(self.drug_qwds, question) and 'disease' in types:
120 | question_type = 'disease_drug'
121 | question_types.append(question_type)
122 |
123 | # 药品治啥病
124 | if self.check_words(self.cure_qwds, question) and 'drug' in types:
125 | question_type = 'drug_disease'
126 | question_types.append(question_type)
127 |
128 | # 疾病接受检查项目
129 | if self.check_words(self.check_qwds, question) and 'disease' in types:
130 | question_type = 'disease_check'
131 | question_types.append(question_type)
132 |
133 | # 已知检查项目查相应疾病
134 | if self.check_words(self.check_qwds+self.cure_qwds, question) and 'check' in types:
135 | question_type = 'check_disease'
136 | question_types.append(question_type)
137 |
138 | # 症状防御
139 | if self.check_words(self.prevent_qwds, question) and 'disease' in types:
140 | question_type = 'disease_prevent'
141 | question_types.append(question_type)
142 |
143 | # 疾病医疗周期
144 | if self.check_words(self.lasttime_qwds, question) and 'disease' in types:
145 | question_type = 'disease_lasttime'
146 | question_types.append(question_type)
147 |
148 | # 疾病治疗方式
149 | if self.check_words(self.cureway_qwds, question) and 'disease' in types:
150 | question_type = 'disease_cureway'
151 | question_types.append(question_type)
152 |
153 | # 疾病治愈可能性
154 | if self.check_words(self.cureprob_qwds, question) and 'disease' in types:
155 | question_type = 'disease_cureprob'
156 | question_types.append(question_type)
157 |
158 | # 疾病易感染人群
159 |         if self.check_words(self.easyget_qwds, question) and 'disease' in types:
160 | question_type = 'disease_easyget'
161 | question_types.append(question_type)
162 |
163 | # 若没有查到相关的外部查询信息,那么则将该疾病的描述信息返回
164 | if question_types == [] and 'disease' in types:
165 | question_types = ['disease_desc']
166 |
167 |         # 同理,若只识别出症状实体且未匹配到问句类型,则按症状反查疾病
168 | if question_types == [] and 'symptom' in types:
169 | question_types = ['symptom_disease']
170 |
171 | # 将多个分类结果进行合并处理,组装成一个字典
172 | data['question_types'] = question_types
173 |
174 | return data
175 |
176 | def build_wdtype_dict(self):
177 | # 构建词对应的类型 将词和对应的类型组成字典
178 | wd_dict = dict()
179 | for wd in self.region_words:
180 | wd_dict[wd] = []
181 | if wd in self.disease_wds:
182 | wd_dict[wd].append('disease')
183 | if wd in self.department_wds:
184 | wd_dict[wd].append('department')
185 | if wd in self.check_wds:
186 | wd_dict[wd].append('check')
187 | if wd in self.drug_wds:
188 | wd_dict[wd].append('drug')
189 | if wd in self.food_wds:
190 | wd_dict[wd].append('food')
191 | if wd in self.symptom_wds:
192 | wd_dict[wd].append('symptom')
193 | if wd in self.producer_wds:
194 | wd_dict[wd].append('producer')
195 | return wd_dict
196 |
197 | def build_actree(self, wordlist):
198 | # 构造actree树 加速过滤
199 | actree = ahocorasick.Automaton()
200 | for index, word in enumerate(wordlist):
201 | actree.add_word(word, (index, word))
202 | actree.make_automaton()
203 | return actree
204 |
205 | def check_medical(self, question):
206 | # 当用户输入一个问题时 先对问题进行过滤
207 | region_wds = []
208 | for i in self.region_tree.iter(question):
209 |             wd = i[1][1]  # iter返回(结束下标, (插入序号, 词)),这里取出匹配到的词本身
210 | region_wds.append(wd)
211 | stop_wds = []
212 | for wd1 in region_wds:
213 | for wd2 in region_wds:
214 | if wd1 in wd2 and wd1 != wd2:
215 | stop_wds.append(wd1)
216 | final_wds = [i for i in region_wds if i not in stop_wds]
217 | final_dict = {i: self.wdtype_dict.get(i) for i in final_wds}
218 | return final_dict
219 |
220 | def check_words(self, wds, sent):
221 | # 基于特征词进行分类 看当前特征在这个问题中包含不包含
222 | for wd in wds:
223 | if wd in sent:
224 | return True
225 | return False
226 |
227 |
228 | if __name__ == '__main__':
229 | handler = QuestionClassifier()
230 | while True:
231 | question = input("input an question:")
232 | data = handler.classify(question)
233 | print(data)
234 |
--------------------------------------------------------------------------------
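The Aho-Corasick automaton is what makes `check_medical` fast: a single pass over the question matches every dictionary word at once. `Automaton.iter()` yields `(end_index, value)` pairs, where `value` is whatever tuple `add_word` stored, here `(index, word)`, which is why the code reads the word out of `i[1][1]`. A standalone sketch with made-up dictionary words:

import ahocorasick

A = ahocorasick.Automaton()
for idx, word in enumerate(['感冒', '头痛', '感冒灵']):
    A.add_word(word, (idx, word))
A.make_automaton()

for end_index, (insert_order, word) in A.iter('感冒了头痛怎么办'):
    print(end_index, word)   # finds 感冒 and 头痛 in one scan of the question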
/py2neo操作neo4j/医疗知识图谱问答/data_process/question_parser.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : question_parser.py
4 | # @Time : 2020/11/25 11:04 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | class QuestionPaser:
10 | def build_entitydict(self, args):
11 | # 构建实体节点
12 | entity_dict = {}
13 | for arg, types in args.items():
14 | for type in types:
15 | if type not in entity_dict:
16 | entity_dict[type] = [arg]
17 | else:
18 | entity_dict[type].append(arg)
19 | return entity_dict
20 |
21 | def parser_main(self, res_classify):
22 | # 解析主函数
23 | args = res_classify['args']
24 | entity_dict = self.build_entitydict(args)
25 | question_types = res_classify['question_types']
26 | sqls = []
27 | for question_type in question_types:
28 | sql_ = {}
29 | sql_['question_type'] = question_type
30 | sql = []
31 | if question_type == 'disease_symptom':
32 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
33 |
34 | elif question_type == 'symptom_disease':
35 | sql = self.sql_transfer(question_type, entity_dict.get('symptom'))
36 |
37 | elif question_type == 'disease_cause':
38 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
39 |
40 | elif question_type == 'disease_acompany':
41 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
42 |
43 | elif question_type == 'disease_not_food':
44 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
45 |
46 | elif question_type == 'disease_do_food':
47 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
48 |
49 | elif question_type == 'food_not_disease':
50 | sql = self.sql_transfer(question_type, entity_dict.get('food'))
51 |
52 | elif question_type == 'food_do_disease':
53 | sql = self.sql_transfer(question_type, entity_dict.get('food'))
54 |
55 | elif question_type == 'disease_drug':
56 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
57 |
58 | elif question_type == 'drug_disease':
59 | sql = self.sql_transfer(question_type, entity_dict.get('drug'))
60 |
61 | elif question_type == 'disease_check':
62 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
63 |
64 | elif question_type == 'check_disease':
65 | sql = self.sql_transfer(question_type, entity_dict.get('check'))
66 |
67 | elif question_type == 'disease_prevent':
68 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
69 |
70 | elif question_type == 'disease_lasttime':
71 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
72 |
73 | elif question_type == 'disease_cureway':
74 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
75 |
76 | elif question_type == 'disease_cureprob':
77 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
78 |
79 | elif question_type == 'disease_easyget':
80 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
81 |
82 | elif question_type == 'disease_desc':
83 | sql = self.sql_transfer(question_type, entity_dict.get('disease'))
84 |
85 | if sql:
86 | sql_['sql'] = sql
87 |
88 | sqls.append(sql_)
89 | return sqls
90 |
91 | def sql_transfer(self, question_type, entities):
92 | # 针对不同的问题 进行查找
93 | if not entities:
94 | return []
95 |
96 | # 查询语句
97 | sql = []
98 | # 查询疾病的原因
99 | if question_type == 'disease_cause':
100 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cause".format(i) for i in entities]
101 |
102 | # 查询疾病的防御措施
103 | elif question_type == 'disease_prevent':
104 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.prevent".format(i) for i in entities]
105 |
106 | # 查询疾病的持续时间
107 | elif question_type == 'disease_lasttime':
108 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cure_lasttime".format(i) for i in entities]
109 |
110 | # 查询疾病的治愈概率
111 | elif question_type == 'disease_cureprob':
112 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cured_prob".format(i) for i in entities]
113 |
114 | # 查询疾病的治疗方式
115 | elif question_type == 'disease_cureway':
116 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cure_way".format(i) for i in entities]
117 |
118 | # 查询疾病的易发人群
119 | elif question_type == 'disease_easyget':
120 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.easy_get".format(i) for i in entities]
121 |
122 | # 查询疾病的相关介绍
123 | elif question_type == 'disease_desc':
124 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.desc".format(i) for i in entities]
125 |
126 | # 查询疾病有哪些症状
127 | elif question_type == 'disease_symptom':
128 | sql = ["MATCH (m:Disease)-[r:has_symptom]->(n:Symptom) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
129 |
130 | # 查询症状会导致哪些疾病
131 | elif question_type == 'symptom_disease':
132 | sql = ["MATCH (m:Disease)-[r:has_symptom]->(n:Symptom) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
133 |
134 | # 查询疾病的并发症
135 | elif question_type == 'disease_acompany':
136 | sql1 = ["MATCH (m:Disease)-[r:acompany_with]->(n:Disease) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
137 | sql2 = ["MATCH (m:Disease)-[r:acompany_with]->(n:Disease) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
138 | sql = sql1 + sql2
139 | # 查询疾病的忌口
140 | elif question_type == 'disease_not_food':
141 | sql = ["MATCH (m:Disease)-[r:no_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
142 |
143 | # 查询疾病建议吃的东西
144 | elif question_type == 'disease_do_food':
145 | sql1 = ["MATCH (m:Disease)-[r:do_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
146 | sql2 = ["MATCH (m:Disease)-[r:recommand_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
147 | sql = sql1 + sql2
148 |
149 | # 已知忌口查疾病
150 | elif question_type == 'food_not_disease':
151 | sql = ["MATCH (m:Disease)-[r:no_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
152 |
153 | # 已知推荐查疾病
154 | elif question_type == 'food_do_disease':
155 | sql1 = ["MATCH (m:Disease)-[r:do_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
156 | sql2 = ["MATCH (m:Disease)-[r:recommand_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
157 | sql = sql1 + sql2
158 |
159 | # 查询疾病常用药品-药品别名记得扩充
160 | elif question_type == 'disease_drug':
161 | sql1 = ["MATCH (m:Disease)-[r:common_drug]->(n:Drug) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
162 | sql2 = ["MATCH (m:Disease)-[r:recommand_drug]->(n:Drug) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
163 | sql = sql1 + sql2
164 |
165 | # 已知药品查询能够治疗的疾病
166 | elif question_type == 'drug_disease':
167 | sql1 = ["MATCH (m:Disease)-[r:common_drug]->(n:Drug) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
168 | sql2 = ["MATCH (m:Disease)-[r:recommand_drug]->(n:Drug) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
169 | sql = sql1 + sql2
170 | # 查询疾病应该进行的检查
171 | elif question_type == 'disease_check':
172 | sql = ["MATCH (m:Disease)-[r:need_check]->(n:Check) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
173 |
174 | # 已知检查查询疾病
175 | elif question_type == 'check_disease':
176 | sql = ["MATCH (m:Disease)-[r:need_check]->(n:Check) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities]
177 | return sql
178 |
179 |
180 | if __name__ == '__main__':
181 | handler = QuestionPaser()
--------------------------------------------------------------------------------
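Taken together, the pipeline is classify -> parser_main -> search_main. For a question like '感冒有什么症状' (assuming 感冒 appears in dict/disease.txt), `classify` returns roughly:

{'args': {'感冒': ['disease']}, 'question_types': ['disease_symptom']}

and `parser_main` turns that into the Cypher payload consumed by AnswerSearcher:

[{'question_type': 'disease_symptom',
  'sql': ["MATCH (m:Disease)-[r:has_symptom]->(n:Symptom) "
          "where m.name = '感冒' return m.name, r.name, n.name"]}]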
/py2neo操作neo4j/医疗知识图谱问答/dict/deny.txt:
--------------------------------------------------------------------------------
1 | 否
2 | 非
3 | 不
4 | 无
5 | 弗
6 | 勿
7 | 毋
8 | 未
9 | 没
10 | 莫
11 | 没有
12 | 防止
13 | 不再
14 | 不会
15 | 不能
16 | 忌
17 | 禁止
18 | 防止
19 | 难以
20 | 忘记
21 | 忽视
22 | 放弃
23 | 拒绝
24 | 杜绝
25 | 不是
26 | 并未
27 | 并无
28 | 仍未
29 | 难以出现
30 | 切勿
31 | 不要
32 | 不可
33 | 别
34 | 管住
35 | 注意
36 | 小心
37 | 少
38 |
39 |
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/dict/department.txt:
--------------------------------------------------------------------------------
1 | 心理科
2 | 妇科
3 | 耳鼻喉科
4 | 中医综合
5 | 泌尿内科
6 | 康复科
7 | 神经外科
8 | 生殖健康
9 | 肿瘤科
10 | 肛肠科
11 | 儿科
12 | 普外科
13 | 心胸外科
14 | 风湿免疫科
15 | 小儿外科
16 | 传染科
17 | 减肥
18 | 其他科室
19 | 肾内科
20 | 皮肤性病科
21 | 口腔科
22 | 不孕不育
23 | 五官科
24 | 整形美容科
25 | 消化内科
26 | 急诊科
27 | 肝胆外科
28 | 遗传病科
29 | 精神科
30 | 神经内科
31 | 小儿内科
32 | 肿瘤内科
33 | 皮肤科
34 | 中医科
35 | 骨外科
36 | 外科
37 | 呼吸内科
38 | 其他综合
39 | 眼科
40 | 内分泌科
41 | 性病科
42 | 妇产科
43 | 肝病
44 | 肿瘤外科
45 | 儿科综合
46 | 营养科
47 | 男科
48 | 产科
49 | 感染科
50 | 泌尿外科
51 | 血液科
52 | 心内科
53 | 烧伤科
54 | 内科
--------------------------------------------------------------------------------
/py2neo操作neo4j/医疗知识图谱问答/run_chatbot.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : run_chatbot.py
4 | # @Time : 2020/11/25 10:07 上午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | from data_process.question_classifier import QuestionClassifier
10 | from data_process.question_parser import QuestionPaser
11 | from data_process.answer_search import AnswerSearcher
12 |
13 |
14 | class ChatBotGraph:
15 | def __init__(self):
16 | self.classifier = QuestionClassifier()
17 | self.parser = QuestionPaser()
18 | self.searcher = AnswerSearcher()
19 |
20 | def chat_main(self, sent):
21 | answer = "您好, 我是小路医药智能助理,希望可以帮到您。如果没答上来,可联系120。祝您身体棒棒的!!!"
22 | res_classify = self.classifier.classify(sent)
23 | if not res_classify:
24 | return answer
25 | res_sql = self.parser.parser_main(res_classify)
26 | final_answers = self.searcher.search_main(res_sql)
27 | if not final_answers:
28 | return answer
29 | else:
30 | return '\n'.join(final_answers)
31 |
32 |
33 | if __name__ == '__main__':
34 | handler = ChatBotGraph()
35 | while True:
36 | question = input("用户:")
37 | answer = handler.chat_main(question)
38 | print("小路:", answer)
39 |
--------------------------------------------------------------------------------
/pyecharts使用/001-柱状图.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-柱状图.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-17
6 | """
7 | from pyecharts.charts import Bar
8 | from pyecharts import options as opts
9 | from pyecharts.globals import ThemeType
10 |
11 |
12 | bar = (
13 | Bar({"theme": ThemeType.MACARONS}) # 设置主题
14 | # Bar()
15 | .set_global_opts(
16 | title_opts=opts.TitleOpts(title="各种衣服价格", subtitle="VS"),
17 | xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)) # 名字倾斜15度
18 | )
19 |
20 | .add_xaxis(["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"])
21 | .add_yaxis("商家A", [5, 20, 36, 10, 75, 90])
22 | .add_yaxis("商家B", [5, 20, 36, 10, 75, 90])
23 |
24 | )
25 | bar.render('柱状图.html')
--------------------------------------------------------------------------------
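`Bar({"theme": ThemeType.MACARONS})` works because pyecharts also accepts a plain dict for the init options; the more explicit, equivalent form goes through `opts.InitOpts`, which is also where the rendered page's title comes from (the "Awesome-pyecharts" seen in the HTML files below is its default `page_title`). A small sketch:

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType

bar = Bar(init_opts=opts.InitOpts(theme=ThemeType.MACARONS,
                                  page_title="各种衣服价格"))  # overrides the "Awesome-pyecharts" default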
/pyecharts使用/002-折线图.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-折线图.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-17
6 | """
7 | from pyecharts.charts import Line
8 | from pyecharts import options as opts
9 |
10 |
11 | if __name__ == "__main__":
12 | x_data = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
13 | # x_data = [1, 2, 3, 4, 5, 6, 7]
14 | y_data = [820, 932, 901, 934, 1290, 1330, 1320]
15 | y_data2 = [237, 132, 401, 534, 290, 1230, 1120]
16 |
17 | line = (
18 | Line()
19 | .set_global_opts(
20 | tooltip_opts=opts.TooltipOpts(is_show=True),
21 | title_opts=opts.TitleOpts(title="收入大比拼", pos_left="center"), # 标题
22 | legend_opts=opts.LegendOpts(pos_left="right"), # 线条示例放在右上角
23 | xaxis_opts=opts.AxisOpts(type_="category", name="星期"), # 横轴的类型与名字
24 | # 注意横轴type_等于value 和category的区别
25 | yaxis_opts=opts.AxisOpts(
26 | type_="value",
27 | name="收入",
28 | splitline_opts=opts.SplitLineOpts(is_show=True), # 是否显示横向格子线
29 | is_scale=True,
30 | ), # 纵轴的类型与名字
31 | )
32 | .add_xaxis(xaxis_data=x_data)
33 | .add_yaxis(
34 | is_smooth=True, # 是否进行平滑处理
35 | series_name="小花收入", # 标识每条线
36 | y_axis=y_data,
37 | symbol="emptyCircle",
38 | linestyle_opts=opts.LineStyleOpts(width=2), # 设置线宽
39 | is_symbol_show=True,
40 | label_opts=opts.LabelOpts(is_show=True), # is_show显示是否需要标注数据
41 | )
42 | .add_yaxis(
43 | series_name="王五", # 标识每条线
44 | y_axis=y_data2,
45 | symbol="emptyCircle",
46 | is_symbol_show=True,
47 | label_opts=opts.LabelOpts(is_show=True), # is_show显示是否需要标注数据
48 |
49 | # 自定义标记
50 | markpoint_opts=opts.MarkPointOpts(
51 | data=[opts.MarkPointItem(name="自定义标记点", coord=[x_data[2], y_data2[2]], value=y_data2[2])]
52 | ),
53 | )
54 |
55 | )
56 | line.render('折线图.html')
57 |
--------------------------------------------------------------------------------
/pyecharts使用/003-饼状图.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-饼状图.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-17
6 | """
7 | from pyecharts import options as opts
8 | from pyecharts.charts import Pie
9 | from pyecharts.faker import Faker
10 |
11 |
12 | if __name__ == '__main__':
13 | # 生成假数据
14 | # a, b = Faker.choose(), Faker.values()
15 | # print(a)
16 | # print(b)
17 | # ['可乐', '雪碧', '橙汁', '绿茶', '奶茶', '百威', '青岛']
18 | # [97, 140, 75, 28, 89, 20, 143]
19 | pie = (
20 | Pie()
21 | .set_global_opts(title_opts=opts.TitleOpts(title="Pie-设置颜色"))
22 |
23 | .add("", [list(z) for z in zip(Faker.choose(), Faker.values())])
24 |
25 | .set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"]) # 每个所占面积的颜色设置
26 |
27 | .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}")) # 标签显示的样子
28 | )
29 | pie.render("饼状图.html")
30 |
--------------------------------------------------------------------------------
/pyecharts使用/折线图.html:
--------------------------------------------------------------------------------
(rendered pyecharts output: an HTML page titled "Awesome-pyecharts"; the inline markup and echarts JS are omitted here)
--------------------------------------------------------------------------------
/pyecharts使用/柱状图.html:
--------------------------------------------------------------------------------
(rendered pyecharts output: an HTML page titled "Awesome-pyecharts"; the inline markup and echarts JS are omitted here)
--------------------------------------------------------------------------------
/pyecharts使用/饼状图.html:
--------------------------------------------------------------------------------
(rendered pyecharts output: an HTML page titled "Awesome-pyecharts"; the inline markup and echarts JS are omitted here)
--------------------------------------------------------------------------------
/pymysql的使用/001-创建数据库.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : 001-创建数据库.py
4 | # @Time : 2020/11/24 1:59 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import pymysql
10 |
11 | if __name__ == '__main__':
12 | # 打开数据库连接
13 |     db = pymysql.connect(host='localhost', user='xxxxx', password='xxxxxx')
14 |
15 | # 使用 cursor() 方法创建一个游标对象 cursor
16 | cursor = db.cursor()
17 |
18 | # 创建数据库
19 | db_name = 'TESTDB'
20 | sql = "CREATE DATABASE {}".format(db_name) # 创建数据库
21 | cursor.execute(sql)
22 |
23 | # 使用 execute() 方法执行 SQL 查询
24 | cursor.execute("SELECT VERSION()")
25 |
26 | # 使用fetchone()方法获取单条数据
27 | data = cursor.fetchone()
28 | print("数据库的版本: ", data)
29 | # 关闭数据库连接
30 | db.close()
--------------------------------------------------------------------------------
/pymysql的使用/002-创建表插入数据.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : 002-创建表插入数据.py
4 | # @Time : 2020/11/24 2:20 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import pymysql
10 |
11 | if __name__ == '__main__':
12 | # 打开数据库连接
13 |     db = pymysql.connect(host="localhost", user="xxxxx", password="xxxxxxx", database="TESTDB")
14 |
15 | # 使用 cursor() 方法创建一个游标对象 cursor
16 | cursor = db.cursor()
17 |
18 | # 使用 execute() 方法执行 SQL,如果表存在则删除
19 | cursor.execute("DROP TABLE IF EXISTS EMPLOYEE")
20 |
21 | sql = '''CREATE TABLE EMPLOYEE (
22 | FIRST_NAME CHAR (20) NOT NULL,
23 | LAST_NAME CHAR (20),
24 | AGE INT,
25 | SEX CHAR (1),
26 | INCOME FLOAT
27 | )
28 | '''
29 | cursor.execute(sql)
30 |
31 | # 接着插入数据
32 | insert_sql = """INSERT INTO EMPLOYEE(FIRST_NAME, LAST_NAME, AGE, SEX, INCOME)
33 | VALUES ('Mac', 'Mohan', 20, 'M', 2000)"""
34 | try:
35 | # 执行sql语句
36 | cursor.execute(insert_sql)
37 | # 提交到数据库执行
38 | db.commit()
39 | except:
40 | print('滚犊子,插不进去')
41 | # 如果发生错误则回滚
42 | db.rollback()
43 | # 关闭数据库连接
44 | db.close()
--------------------------------------------------------------------------------
/pymysql的使用/003-查询.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : 003-查询.py
4 | # @Time : 2020/11/24 2:34 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import pymysql
10 |
11 | if __name__ == '__main__':
12 | # 打开数据库连接
13 |     db = pymysql.connect(host="localhost", user="xxxxxx", password="xxxxxxx", database="TESTDB")
14 |
15 | # 使用cursor()方法获取操作游标
16 | cursor = db.cursor()
17 |
18 | # SQL 查询语句
19 | sql = "SELECT * FROM EMPLOYEE WHERE INCOME > %s" % (1000)
20 | try:
21 | cursor.execute(sql)
22 |
23 | # 获取所有记录列表
24 | results = cursor.fetchall()
25 | for row in results:
26 | fname = row[0]
27 | lname = row[1]
28 | age = row[2]
29 | sex = row[3]
30 | income = row[4]
31 | print('fname: {}, lname:{}, age:{}, sex:{}, income:{}'.format(fname, lname, age, sex, income))
32 | except:
33 | print("啥也找不到")
34 |
35 | # 关闭数据库连接
36 | db.close()
--------------------------------------------------------------------------------
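The query above formats the value into the SQL string with `%` before calling `execute`; pymysql can do the substitution itself, escaping the value in the process, which matters once the value comes from user input. The equivalent parameterized form, using the same cursor:

sql = "SELECT * FROM EMPLOYEE WHERE INCOME > %s"
cursor.execute(sql, (1000,))   # pymysql escapes and substitutes the value itself
results = cursor.fetchall()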
/pymysql的使用/004-更新.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : 004-更新.py
4 | # @Time : 2020/11/24 2:41 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import pymysql
10 |
11 | if __name__ == "__main__":
12 | # 打开数据库连接
13 |     db = pymysql.connect(host="localhost", user="xxxxxx", password="xxxxxx", database="TESTDB")
14 |
15 | # 使用cursor()方法获取操作游标
16 | cursor = db.cursor()
17 |
18 | # SQL 更新语句 给男性加1岁
19 | sql = "UPDATE EMPLOYEE SET AGE = AGE + 1 WHERE SEX = '%c'" % ('M')
20 | try:
21 | # 执行SQL语句
22 | cursor.execute(sql)
23 | # 提交到数据库执行
24 | db.commit()
25 | except:
26 | # 发生错误时回滚
27 | db.rollback()
28 |
29 | # 关闭数据库连接
30 | db.close()
--------------------------------------------------------------------------------
/pymysql的使用/005-删除.py:
--------------------------------------------------------------------------------
1 | """
2 | # -*- coding: utf-8 -*-
3 | # @File : 005-删除.py
4 | # @Time : 2020/11/24 2:43 下午
5 | # @Author : xiaolu
6 | # @Email : luxiaonlp@163.com
7 | # @Software: PyCharm
8 | """
9 | import pymysql
10 |
11 | if __name__ == '__main__':
12 | # 打开数据库连接
13 |     db = pymysql.connect(host="localhost", user="xxxxxx", password="xxxxx", database="TESTDB")
14 |
15 | # 使用cursor()方法获取操作游标
16 | cursor = db.cursor()
17 |
18 | # SQL 删除语句
19 | sql = "DELETE FROM EMPLOYEE WHERE AGE > %s" % (20)
20 | try:
21 | # 执行SQL语句
22 | cursor.execute(sql)
23 | # 提交修改
24 | db.commit()
25 | except:
26 | # 发生错误时回滚
27 | db.rollback()
28 |
29 | # 关闭连接
30 | db.close()
--------------------------------------------------------------------------------
/python并发编程/001-多线程.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-多线程.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import threading
8 | import time
9 | import requests
10 |
11 |
12 | def craw(url):
13 | # 这是个爬虫
14 | r = requests.get(url)
15 | print(url, r.status_code)
16 |
17 |
18 | def single_thread():
19 | # 单线程爬虫
20 | print('single_thread start')
21 | for url in urls:
22 | craw(url)
23 | print('single_thread end')
24 |
25 |
26 | def multi_thread():
27 | # 多线程爬虫
28 | print("multi_thread begin")
29 | threads = []
30 | for url in urls:
31 | threads.append(
32 | threading.Thread(target=craw, args=(url,)) # url, 之所以加逗号 是因为这里必须为元组
33 | )
34 |
35 | # 启动多线程
36 | for thread in threads:
37 | thread.start()
38 |
39 | # 等待结束
40 | for thread in threads:
41 | thread.join()
42 | print("multi_thread end")
43 |
44 |
45 | if __name__ == '__main__':
46 | # 爬50页的内容
47 | urls = ['https://www.cnblogs.com/sitehome/p/{}'.format(page) for page in range(1, 50 + 1)]
48 |
49 | # 单线程走起
50 | start = time.time()
51 | single_thread()
52 | end = time.time()
53 | print("single thread cost:", end - start, "seconds")
54 |
55 | # 多线程走起
56 | start = time.time()
57 | multi_thread()
58 | end = time.time()
59 | print("multi thread cost:", end - start, "seconds")
--------------------------------------------------------------------------------
/python并发编程/002-生产者消费者实现多线程爬虫.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-生产者消费者实现多线程爬虫.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import queue
8 | import time
9 | import random
10 | import threading
11 | import requests
12 | from bs4 import BeautifulSoup
13 |
14 |
15 | def craw(url):
16 | # 爬取网页内容
17 | r = requests.get(url)
18 | return r.text
19 |
20 |
21 | def parse(html):
22 | # 解析其中的内容
23 | soup = BeautifulSoup(html, "html.parser")
24 | links = soup.find_all("a", class_="post-item-title")
25 |     return [(link["href"], link.get_text()) for link in links]  # 把链接和标题拿出来
26 |
27 |
28 | def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
29 | '''
30 | 生产者
31 | :param url_queue: url的队列 生产者从中拿出链接 去爬虫
32 | :param html_queue: 生产者将爬取的内容放到这里
33 | :return:
34 | '''
35 | while True:
36 | url = url_queue.get()
37 | html = craw(url)
38 | html_queue.put(html)
39 | print('线程名: ', threading.current_thread().name,
40 | "url_queue.size=", url_queue.qsize()) # 获取url队列中还有多少待爬取的
41 | time.sleep(random.randint(1, 2))
42 |
43 |
44 | def do_parse(html_queue: queue.Queue, fout):
45 | '''
46 | 消费者
47 | :param html_queue: 生产者生产出的内容
48 | :param fout: 消费者将内容解析出来 存到fout中
49 | :return:
50 | '''
51 | while True:
52 | html = html_queue.get()
53 | results = parse(html)
54 | for result in results:
55 | fout.write(str(result) + "\n")
56 | print('线程名: ', threading.current_thread().name,
57 | "html_queue.size=", html_queue.qsize())
58 | time.sleep(random.randint(1, 2))
59 |
60 |
61 | if __name__ == '__main__':
62 | # 待爬取的网页链接
63 | urls = [
64 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1)
65 | ]
66 |
67 | url_queue = queue.Queue()
68 | html_queue = queue.Queue()
69 |
70 | # 将url放进队列中
71 | for url in urls:
72 | url_queue.put(url)
73 |
74 | # 启动三个线程去做生产者
75 | for idx in range(3):
76 | t = threading.Thread(target=do_craw, args=(url_queue, html_queue),
77 | name="craw{}".format(idx))
78 | t.start()
79 |
80 | fout = open("data.txt", "w")
81 | # 启动两个线程去做消费者
82 | for idx in range(2):
83 | t = threading.Thread(target=do_parse, args=(html_queue, fout),
84 | name="parse{}".format(idx))
85 | t.start()
86 |
--------------------------------------------------------------------------------
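As written, this script never terminates: both worker functions loop forever and the threads are non-daemon, so `data.txt` is also never closed cleanly. One common pattern (a sketch that changes the original behavior) is to start the workers as daemon threads and have the main thread wait on the URL queue instead:

# daemon workers are killed automatically when the main thread exits
t = threading.Thread(target=do_craw, args=(url_queue, html_queue),
                     name="craw{}".format(idx), daemon=True)
t.start()

# later, in the main thread; this requires do_craw to call
# url_queue.task_done() after finishing each url
url_queue.join()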
/python并发编程/003-多线程锁机制.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-多线程锁机制.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import threading
8 | import time
9 |
10 | lock = threading.Lock()
11 |
12 |
13 | class Account:
14 | def __init__(self, balance):
15 | self.balance = balance
16 |
17 |
18 | def draw(account, amount):
19 | with lock:
20 | if account.balance >= amount:
21 | # time.sleep(0.1) # 如果不加锁,这里休息0.1秒,每次都会出问题,因为这里会引起线程阻塞,一定会切换
22 | print(threading.current_thread().name, "取钱成功")
23 | account.balance -= amount
24 | print(threading.current_thread().name, "余额", account.balance)
25 | else:
26 | print(threading.current_thread().name,
27 | "取钱失败,余额不足")
28 |
29 |
30 | if __name__ == "__main__":
31 | account = Account(1000) # 金额
32 |
33 |     # 启动两个线程 分别取800块
34 | ta = threading.Thread(name="ta", target=draw, args=(account, 800))
35 | tb = threading.Thread(name="tb", target=draw, args=(account, 800))
36 |
37 | ta.start()
38 | tb.start()
--------------------------------------------------------------------------------
/python并发编程/004-线程池的使用.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 004-线程池的使用.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import concurrent.futures
8 | import requests
9 | from bs4 import BeautifulSoup
10 |
11 |
12 | def craw(url):
13 | # 爬取网页内容
14 | r = requests.get(url)
15 | return r.text
16 |
17 |
18 | def parse(html):
19 | # 解析其中的内容
20 | soup = BeautifulSoup(html, "html.parser")
21 | links = soup.find_all("a", class_="post-item-title")
22 |     return [(link["href"], link.get_text()) for link in links]  # 把链接和标题拿出来
23 |
24 |
25 | if __name__ == '__main__':
26 | # 待爬取的网页链接
27 | urls = [
28 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1)
29 | ]
30 |
31 | # craw
32 | with concurrent.futures.ThreadPoolExecutor() as pool:
33 | htmls = pool.map(craw, urls)
34 | htmls = list(zip(urls, htmls))
35 | for url, html in htmls:
36 | print(url, len(html))
37 | print("craw over")
38 |
39 | # parse
40 | with concurrent.futures.ThreadPoolExecutor() as pool:
41 | futures = {}
42 | for url, html in htmls:
43 | future = pool.submit(parse, html)
44 | futures[future] = url
45 |
46 | # for future, url in futures.items():
47 | # print(url, future.result())
48 |
49 | for future in concurrent.futures.as_completed(futures):
50 | url = futures[future]
51 | print(url, future.result())
52 |
--------------------------------------------------------------------------------
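The two submission styles above differ in ordering: `pool.map` yields results in the order of the input `urls`, while `submit` plus `as_completed` yields futures as they finish, which is why the parse stage keeps a `future -> url` dict to recover which page each result came from. A self-contained illustration:

from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor() as pool:
    in_order = list(pool.map(pow, [2, 2, 2], [1, 2, 3]))        # [2, 4, 8], input order
    futs = {pool.submit(pow, 2, n): n for n in (1, 2, 3)}
    by_completion = [f.result() for f in as_completed(futs)]    # completion order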
/python并发编程/005-线程池加速flask-web服务.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 005-线程池加速flask-web服务.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import flask
8 | import json
9 | import time
10 | from concurrent.futures import ThreadPoolExecutor
11 |
12 | app = flask.Flask(__name__)
13 | pool = ThreadPoolExecutor()
14 |
15 |
16 | def read_file():
17 | time.sleep(0.1)
18 | return "file result"
19 |
20 |
21 | def read_db():
22 | time.sleep(0.2)
23 | return "db result"
24 |
25 |
26 | def read_api():
27 | time.sleep(0.3)
28 | return "api result"
29 |
30 |
31 | @app.route("/")
32 | def index():
33 | result_file = pool.submit(read_file)
34 | result_db = pool.submit(read_db)
35 | result_api = pool.submit(read_api)
36 |
37 | return json.dumps({
38 | "result_file": result_file.result(),
39 | "result_db": result_db.result(),
40 | "result_api": result_api.result(),
41 | })
42 |
43 |
44 | if __name__ == "__main__":
45 | app.run()
--------------------------------------------------------------------------------
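The speedup here comes from `submit` returning immediately: the three simulated IO waits overlap, so the handler responds in roughly max(0.1, 0.2, 0.3) = 0.3s instead of the 0.6s of sequential calls; each `result()` only blocks until its own future completes. A quick check of that claim, reusing the pool and functions above (a sketch, not part of the app):

import time
start = time.time()
futures = [pool.submit(read_file), pool.submit(read_db), pool.submit(read_api)]
values = [f.result() for f in futures]
print(time.time() - start)   # ~0.3s rather than ~0.6s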
/python并发编程/006-多进程的使用.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 006-多进程的使用.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import math
8 | from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
9 | import time
10 |
11 |
12 | def is_prime(n):
13 | if n < 2:
14 | return False
15 | if n == 2:
16 | return True
17 | if n % 2 == 0:
18 | return False
19 | sqrt_n = int(math.floor(math.sqrt(n)))
20 | for i in range(3, sqrt_n + 1, 2):
21 | if n % i == 0:
22 | return False
23 | return True
24 |
25 |
26 | def single_thread():
27 | for number in PRIMES:
28 | is_prime(number)
29 |
30 |
31 | def multi_thread():
32 | with ThreadPoolExecutor() as pool:
33 | pool.map(is_prime, PRIMES)
34 |
35 |
36 | def multi_process():
37 | with ProcessPoolExecutor() as pool:
38 | pool.map(is_prime, PRIMES)
39 |
40 |
41 | if __name__ == "__main__":
42 | PRIMES = [112272535095293] * 100
43 |
44 | start = time.time()
45 | single_thread()
46 | end = time.time()
47 | print("single_thread, cost:", end - start, "seconds")
48 |
49 | start = time.time()
50 | multi_thread()
51 | end = time.time()
52 | print("multi_thread, cost:", end - start, "seconds")
53 |
54 | start = time.time()
55 | multi_process()
56 | end = time.time()
57 | print("multi_process, cost:", end - start, "seconds")
58 |
--------------------------------------------------------------------------------
/python并发编程/007-多进程加速flask-web服务.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 007-多进程加速flask-web服务.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import flask
8 | from concurrent.futures import ProcessPoolExecutor
9 | import math
10 | import json
11 |
12 |
13 | app = flask.Flask(__name__)
14 |
15 |
16 | def is_prime(n):
17 | if n < 2:
18 | return False
19 | if n == 2:
20 | return True
21 | if n % 2 == 0:
22 | return False
23 | sqrt_n = int(math.floor(math.sqrt(n)))
24 | for i in range(3, sqrt_n + 1, 2):
25 | if n % i == 0:
26 | return False
27 | return True
28 |
29 |
30 | @app.route("/is_prime/<numbers>")
31 | def api_is_prime(numbers):
32 | number_list = [int(x) for x in numbers.split(",")]
33 | results = process_pool.map(is_prime, number_list)
34 | return json.dumps(dict(zip(number_list, results)))
35 |
36 |
37 | if __name__ == "__main__":
38 | process_pool = ProcessPoolExecutor()
39 | app.run()
40 |
--------------------------------------------------------------------------------
/python并发编程/008-协程爬虫.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 008-协程爬虫.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import asyncio
8 | import aiohttp
9 | import time
10 |
11 |
12 | async def async_craw(url):
13 | print("craw url: ", url)
14 | async with aiohttp.ClientSession() as session:
15 | async with session.get(url) as resp:
16 | result = await resp.text()
17 | print(f"craw url: {url}, {len(result)}")
18 |
19 |
20 | if __name__ == '__main__':
21 | urls = [
22 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1)
23 | ]
24 |
25 |     loop = asyncio.get_event_loop()  # 获取事件循环
26 | tasks = [loop.create_task(async_craw(url)) for url in urls] # 建立任务
27 | start = time.time()
28 | loop.run_until_complete(asyncio.wait(tasks)) # 开始执行
29 | end = time.time()
30 | print("use time seconds: ", end - start)
--------------------------------------------------------------------------------
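`asyncio.get_event_loop()` outside a running loop is deprecated since Python 3.10. On Python 3.7+ the same crawl can be driven by `asyncio.run` and `asyncio.gather`; a sketch of the modern form, with the aiohttp coroutine above unchanged:

async def main():
    await asyncio.gather(*(async_craw(url) for url in urls))

if __name__ == '__main__':
    urls = ["https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1)]
    start = time.time()
    asyncio.run(main())
    print("use time seconds: ", time.time() - start)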
/python并发编程/009-使用信号量控制协程数进行爬虫.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 009-使用信号量控制协程数进行爬虫.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-02-01
6 | """
7 | import asyncio
8 | import aiohttp
9 | import time
10 |
11 |
12 | async def async_craw(url):
13 |     async with semaphore:  # 用信号量限制同时运行的协程数
14 | print("craw url: ", url)
15 | async with aiohttp.ClientSession() as session:
16 | async with session.get(url) as resp:
17 | result = await resp.text()
18 | await asyncio.sleep(5)
19 | print(f"craw url: {url}, {len(result)}")
20 |
21 |
22 | if __name__ == '__main__':
23 | urls = [
24 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1)
25 | ]
26 | semaphore = asyncio.Semaphore(10) # 控制并发量
27 |
28 |     loop = asyncio.get_event_loop()  # 获取事件循环
29 | tasks = [loop.create_task(async_craw(url)) for url in urls] # 建立任务
30 | start = time.time()
31 | loop.run_until_complete(asyncio.wait(tasks)) # 开始执行
32 | end = time.time()
33 | print("use time seconds: ", end - start)
34 |
--------------------------------------------------------------------------------
/streamlit的使用/鸢尾花数据的分类app/app.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : app.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-06-09
6 | """
7 | import streamlit as st
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | from sklearn import datasets
11 | from sklearn.model_selection import train_test_split
12 | from sklearn.decomposition import PCA
13 | from sklearn.svm import SVC
14 | from sklearn.neighbors import KNeighborsClassifier
15 | from sklearn.ensemble import RandomForestClassifier
16 | from sklearn.metrics import accuracy_score
17 |
18 |
19 | def get_dataset(name):
20 | # 加载数据集
21 | if name == 'Iris':
22 | data = datasets.load_iris()
23 | elif name == 'Wine':
24 | data = datasets.load_wine()
25 | else:
26 | data = datasets.load_breast_cancer()
27 | X = data.data
28 | y = data.target
29 | return X, y
30 |
31 |
32 | def add_parameter_ui(clf_name):
33 | # 针对每个分类器 可以调节的超参数
34 | params = dict()
35 | if clf_name == 'SVM':
36 | C = st.sidebar.slider('C', 0.01, 10.0) # 滑动条
37 | params['C'] = C
38 | elif clf_name == 'KNN':
39 | K = st.sidebar.slider('K', 1, 15) # 滑动条
40 | params['K'] = K
41 | else:
42 | max_depth = st.sidebar.slider('max_depth', 2, 15) # 滑动条
43 | params['max_depth'] = max_depth
44 | n_estimators = st.sidebar.slider('n_estimators', 1, 100) # 滑动条
45 | params['n_estimators'] = n_estimators
46 | return params
47 |
48 |
49 | def get_classifier(clf_name, params):
50 | # 实例化分类器
51 | clf = None
52 | if clf_name == 'SVM':
53 | clf = SVC(C=params['C'])
54 | elif clf_name == 'KNN':
55 | clf = KNeighborsClassifier(n_neighbors=params['K'])
56 | else:
57 | clf = RandomForestClassifier(n_estimators=params['n_estimators'],
58 | max_depth=params['max_depth'], random_state=1234)
59 | return clf
60 |
61 |
62 | def plot_result():
63 | pca = PCA(2)
64 | X_projected = pca.fit_transform(X)
65 | x1 = X_projected[:, 0]
66 | x2 = X_projected[:, 1]
67 | fig = plt.figure()
68 | plt.scatter(x1, x2, c=y, alpha=0.8, cmap='viridis')
69 |
70 | plt.xlabel('feature_1')
71 | plt.ylabel('feature_2')
72 | plt.colorbar()
73 | st.pyplot(fig)
74 |
75 |
76 | if __name__ == '__main__':
77 | # 启动该项目,命令行: streamlit run app.py
78 | st.title('鸢尾花数据集的分类')
79 | st.write('''
80 | # 支持选择不同的分类器(SVM/Random Forest/KNN)
81 | 哪一个分类器更好呢?''') # 支持markdown
82 |
83 | # 1. 可以选择不同的数据集 是一个下拉选择框
84 | dataset_name = st.sidebar.selectbox(
85 | '数据集的选择',
86 | ('Iris', 'Breast Cancer', 'Wine')
87 | )
88 |
89 | st.write('## {} 数据集'.format(dataset_name)) # 选择好数据集 这里显示
90 |
91 | # 2. 可以选择不同的分类器, 是一个下拉选择框
92 | classifier_name = st.sidebar.selectbox(
93 | '分类器的选择',
94 | ('KNN', 'SVM', 'Random Forest')
95 | )
96 |
97 | X, y = get_dataset(dataset_name)
98 | st.write('数据集的形状:', X.shape)
99 | st.write('数据集的类别数:', len(np.unique(y)))
100 |
101 | params = add_parameter_ui(classifier_name)
102 |
103 | clf = get_classifier(classifier_name, params)
104 |
105 | # 模型训练
106 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
107 |
108 | clf.fit(X_train, y_train)
109 | y_pred = clf.predict(X_test)
110 |
111 | acc = accuracy_score(y_test, y_pred) # 准确率
112 |
113 | st.write('选择的分类器为: ', classifier_name)
114 | st.write('准确率: ', acc)
115 |
116 | # 画图
117 | plot_result()
118 |
--------------------------------------------------------------------------------
/textrank4zh/001-关键词提取.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/4 16:37
4 | @Auth : xiaolu
5 | @File :001-关键词提取.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from textrank4zh import TextRank4Keyword
10 |
11 |
12 | if __name__ == '__main__':
13 | # 加载文本
14 | data = []
15 | with open('./data/text.txt', 'r', encoding='utf8') as f:
16 | for line in f.readlines():
17 | line = line.strip()
18 | data.append(line)
19 |
20 | # 关键词提取
21 | tr4w = TextRank4Keyword()
22 |
23 | data = data[:1]
24 | for text in data:
25 | tr4w.analyze(text=text, lower=True, window=2)
26 | for item in tr4w.get_keywords(20, word_min_len=1):
27 | print('{}:{:6f}'.format(item.word, item.weight))
28 |
29 | # 关键短语抽取
30 | for text in data:
31 | tr4w.analyze(text=text, lower=True, window=2)
32 | for phrase in tr4w.get_keyphrases(20, min_occur_num=1):
33 | print(phrase)
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/textrank4zh/002-摘要抽取.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/4 16:47
4 | @Auth : xiaolu
5 | @File :002-摘要抽取.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from textrank4zh import TextRank4Sentence
10 |
11 | if __name__ == '__main__':
12 | # 加载文本
13 | data = []
14 | with open('./data/text.txt', 'r', encoding='utf8') as f:
15 | for line in f.readlines():
16 | line = line.strip()
17 | data.append(line)
18 |
19 | # 摘要抽取
20 | tr4s = TextRank4Sentence()
21 |
22 | data = data[:1]
23 | for text in data:
24 | tr4s.analyze(text=text, lower=True, source='all_filters')
25 | for item in tr4s.get_key_sentences(num=3):
26 | print(item.index, item.weight, item.sentence)
27 |
--------------------------------------------------------------------------------
/textrank4zh/readme.txt:
--------------------------------------------------------------------------------
1 | 安装 pip install textrank4zh -i https://pypi.douban.com/simple/
--------------------------------------------------------------------------------