├── Faiss的使用 ├── 001-欧式距离检索.py ├── 002-倒排表快速索引.py ├── 003-乘积量化索引.py ├── 004-faiss实现kmeans聚类.py ├── 005-faiss实现pca降维.py ├── 006-faiss实现PQ编码和解码.py ├── 007-faiss实现标量量化器.py └── 008-faiss_use_gpu.py ├── LAC分词器 ├── 001-分词.py ├── 002-词性标注和实体识别.py ├── 003-加载自己的词表进行分词.py └── vocab.txt ├── PySpark ├── .DS_Store ├── 001-data_processing_use_pyspark.py ├── 002-linear_regression_use_pyspark.py ├── 003-logistic_regression_use_pyspark.py ├── 004-random_forests_classification_use_pyspark.py ├── 005-kmeans_cluster_use_pyspark.py ├── 006-recommendr_system_use_pyspark.py ├── 007-NLP_use_pyspark.py └── data │ ├── Linear_regression_dataset.csv │ ├── Log_Reg_dataset.csv │ ├── Movie_reviews.csv │ ├── affairs.csv │ ├── iris_dataset.csv │ ├── movie_ratings_df.csv │ └── sample_data.csv ├── README.md ├── RSA实战 ├── 001-rsa生成公私钥并保存.py └── 002-公钥加密私钥解密.py ├── apscheduler实现定时任务 └── 定时任务.py ├── chinesebert中的pinyin和glyph的处理 ├── MSYH.TTC ├── image_test.py └── pinyin_test.py ├── collections的用法 └── 001-collections中的namedtuple用法.py ├── elasticsearch ├── 001-创建库并插入数据.py └── 002-es中的搜索.py ├── flask+echart+ajax ├── .DS_Store ├── app.py ├── static │ ├── .DS_Store │ ├── css │ │ └── main.css │ └── js │ │ ├── controller.js │ │ ├── echarts.min.js │ │ ├── jquery.js │ │ ├── left.js │ │ └── right.js └── templates │ └── index.html ├── flask表单那些事 ├── .DS_Store ├── app.py └── templates │ └── index.html ├── gensim ├── 001-TF-IDF句子相似度计算.py ├── 002-gensim文本摘要.py └── data │ ├── answer.txt │ ├── question.txt │ ├── stopwords.txt │ ├── test.py │ └── text.txt ├── gradio学习 ├── 01-row_column_layout.py └── 02-chatglm_web.py ├── ipdb调试python程序 ├── 001-简单调试.py └── readme.txt ├── logging模块的使用 ├── 001-日志级别的使用.py ├── 002-日志控制台输出.py ├── 003-日志文件输出.py └── 004-捕捉异常.py ├── pandas一键画图 ├── 001-plot_zhexiantu.html ├── 001-plot_zhexiantu.py ├── 002-plot_sandiantu.html ├── 002-plot_sandiantu.py ├── 003-plot_zhuzhuangtu.html └── 003-plot_zhuzhuangtu.py ├── py2neo操作neo4j ├── .DS_Store ├── py2neo简单练习 │ ├── create_graph_v1.py │ ├── create_graph_v2.py │ ├── mingchaonaxieshier.xlsx │ ├── santi.xlsx │ └── test.xlsx ├── readme.txt ├── 事件三元组抽取 │ ├── ltp的使用.py │ ├── my_vocab.txt │ └── readme.txt └── 医疗知识图谱问答 │ ├── .DS_Store │ ├── ahocorasick的使用 │ └── demo.py │ ├── build_medical_graph.py │ ├── data │ ├── medical.json │ └── medical_min.json │ ├── data_process │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── answer_search.cpython-37.pyc │ │ ├── question_classifier.cpython-37.pyc │ │ └── question_parser.cpython-37.pyc │ ├── answer_search.py │ ├── question_classifier.py │ └── question_parser.py │ ├── dict │ ├── check.txt │ ├── deny.txt │ ├── department.txt │ ├── disease.txt │ ├── drug.txt │ ├── food.txt │ ├── producer.txt │ └── symptom.txt │ └── run_chatbot.py ├── pyecharts使用 ├── 001-柱状图.py ├── 002-折线图.py ├── 003-饼状图.py ├── 折线图.html ├── 柱状图.html └── 饼状图.html ├── pymysql的使用 ├── 001-创建数据库.py ├── 002-创建表插入数据.py ├── 003-查询.py ├── 004-更新.py └── 005-删除.py ├── python并发编程 ├── 001-多线程.py ├── 002-生产者消费者实现多线程爬虫.py ├── 003-多线程锁机制.py ├── 004-线程池的使用.py ├── 005-线程池加速flask-web服务.py ├── 006-多进程的使用.py ├── 007-多进程加速flask-web服务.py ├── 008-协程爬虫.py ├── 009-使用信号量控制协程数进行爬虫.py └── data.txt ├── streamlit的使用 └── 鸢尾花数据的分类app │ └── app.py └── textrank4zh ├── 001-关键词提取.py ├── 002-摘要抽取.py ├── data └── text.txt └── readme.txt /Faiss的使用/001-欧式距离检索.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 001-欧式距离检索.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-08-09 6 | """ 7 | import faiss 8 | import numpy as np 
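# ---- 编辑者补充的示例(假设性代码, 非原仓库内容) ----
# IndexFlatL2 按欧式距离检索; 若想按余弦相似度检索, 常见做法是
# 先对向量做L2归一化, 再用内积索引 IndexFlatIP(归一化后内积即余弦相似度):
def build_cosine_index(vectors):
    vectors = np.ascontiguousarray(vectors, dtype='float32')
    faiss.normalize_L2(vectors)  # 原地把每行归一化为单位向量
    index = faiss.IndexFlatIP(vectors.shape[1])  # 内积索引
    index.add(vectors)
    return index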
9 | 
10 | 
11 | if __name__ == '__main__':
12 | n_data, d = 1000, 512 # 检索库中的向量个数, 每个向量的维度
13 | np.random.seed(43) # 随机种子 为了多次执行结果一致
14 | 
15 | # 检索库的构造
16 | data = []
17 | mu, sigma = 3, 0.1 # 这里是通过高斯分布随机产生若干向量,这两个参数为均值和方差
18 | for i in range(n_data):
19 | data.append(np.random.normal(mu, sigma, d))
20 | data = np.array(data).astype('float32') # faiss只支持32位的浮点数
21 | 
22 | # 检索向量的生成
23 | query = []
24 | n_query = 10 # 生成10个query向量
25 | mu, sigma = 3, 0.1
26 | np.random.seed(12)
27 | for i in range(n_query):
28 | query.append(np.random.normal(mu, sigma, d))
29 | query = np.array(query).astype('float32')
30 | 
31 | # 构建索引 记住要传入向量维度d
32 | index = faiss.IndexFlatL2(d)
33 | # print(index.is_trained) # 这里若是false就要训练 后面讲
34 | 
35 | # 添加数据
36 | index.add(data)
37 | # print(index.ntotal) # 总的数据量
38 | 
39 | # 开始检索
40 | k = 10 # 指定让其返回10个距离最近的
41 | 
42 | # 这里我们选取data中的前五个 容易看到结果,因为自己跟自己距离肯定为0 所以最相关的肯定是自己
43 | query_self = data[:5]
44 | 
45 | dis, ind = index.search(query_self, k=k)
46 | print(dis) # 每条数据代表了当前这个query 与最相关的十个数据的距离
47 | print(ind) # 每条数据代表了当前这个query 最相关的十条数据的索引
48 | """
49 | [[0. 8.55197 8.634906 8.683499 8.698736 8.821949 8.902446
50 | 8.943979 8.9516735 8.972908 ]
51 | [0. 8.369204 8.482748 8.53028 8.581224 8.680499 8.684254
52 | 8.697291 8.719812 8.753435 ]
53 | [0. 8.209936 8.392483 8.456179 8.473589 8.480727 8.551348
54 | 8.553277 8.576391 8.592704 ]
55 | [0. 8.473689 8.621014 8.827385 8.883725 8.980131 8.99064
56 | 9.015673 9.017438 9.027972 ]
57 | [0. 8.268832 8.349455 8.597895 8.611757 8.658188 8.675722
58 | 8.685029 8.70588 8.707612 ]]
59 | [[ 0 877 502 42 606 366 348 923 563 56]
60 | [ 1 849 974 106 348 364 877 242 280 173]
61 | [ 2 877 127 655 253 233 558 678 13 208]
62 | [ 3 421 94 348 502 402 536 646 563 735]
63 | [ 4 986 230 209 446 889 974 241 550 248]]
64 | """
65 | 
66 | 
67 | 
68 | 
--------------------------------------------------------------------------------
/Faiss的使用/002-倒排表快速索引.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-倒排表快速索引.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import numpy as np
8 | import faiss
9 | 
10 | if __name__ == '__main__':
11 | n_data, d = 1000, 512 # 检索库中的向量个数, 每个向量的维度
12 | np.random.seed(43) # 随机种子 为了多次执行结果一致
13 | 
14 | # 检索库的构造
15 | data = []
16 | mu, sigma = 3, 0.1 # 这里是通过高斯分布随机产生若干向量,这两个参数为均值和方差
17 | for i in range(n_data):
18 | data.append(np.random.normal(mu, sigma, d))
19 | data = np.array(data).astype('float32') # faiss只支持32位的浮点数
20 | 
21 | # 检索向量的生成
22 | query = []
23 | n_query = 10 # 生成10个query向量
24 | mu, sigma = 3, 0.1
25 | np.random.seed(12)
26 | for i in range(n_query):
27 | query.append(np.random.normal(mu, sigma, d))
28 | query = np.array(query).astype('float32')
29 | 
30 | nlist = 50 # 将数据库向量分割为多少个维诺空间(Voronoi单元)
31 | k = 10
32 | quantizer = faiss.IndexFlatL2(d) # 量化器
33 | index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2) # METRIC_L2计算L2距离, 或faiss.METRIC_INNER_PRODUCT计算内积
34 | assert not index.is_trained # 倒排表索引类型需要训练
35 | index.train(data) # 训练数据集应该与数据库数据集同分布
36 | assert index.is_trained
37 | 
38 | index.add(data)
39 | index.nprobe = 2 # 选择n个维诺空间进行检索
40 | dis, ind = index.search(query, k)
41 | print(dis)
42 | print(ind)
43 | 
--------------------------------------------------------------------------------
/Faiss的使用/003-乘积量化索引.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-乘积量化索引.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import numpy as np
8 | import faiss
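# ---- 编辑者补充的说明(假设性示例, 非原仓库内容) ----
# IVFPQ 把每条向量切成 m 段, 每段用 nbits 位编码, 压缩后的大小与原始维度无关:
def pq_bytes_per_vector(m=8, nbits=4):
    # 例: m=8, nbits=4 时, 一条512维float32向量(2048字节)被压成 8*4/8 = 4 字节
    return m * nbits / 8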
9 | 
10 | if __name__ == '__main__':
11 | n_data, d = 1000, 512 # 检索库中的向量个数, 每个向量的维度
12 | np.random.seed(43) # 随机种子 为了多次执行结果一致
13 | 
14 | # 检索库的构造
15 | data = []
16 | mu, sigma = 3, 0.1 # 这里是通过高斯分布随机产生若干向量,这两个参数为均值和方差
17 | for i in range(n_data):
18 | data.append(np.random.normal(mu, sigma, d))
19 | data = np.array(data).astype('float32') # faiss只支持32位的浮点数
20 | 
21 | # 检索向量的生成
22 | query = []
23 | n_query = 10 # 生成10个query向量
24 | mu, sigma = 3, 0.1
25 | np.random.seed(12)
26 | for i in range(n_query):
27 | query.append(np.random.normal(mu, sigma, d))
28 | query = np.array(query).astype('float32')
29 | 
30 | nlist = 50
31 | m = 8 # 子向量的段数(按列方向划分), 必须能整除d
32 | k = 10
33 | quantizer = faiss.IndexFlatL2(d)
34 | index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 4) # 4 表示每个子向量被编码为 4 bits
35 | 
36 | index.train(data)
37 | index.add(data)
38 | index.nprobe = 50
39 | dis, ind = index.search(data[:10], k) # 查询自身
40 | print(dis)
41 | print(ind)
42 | 
43 | dis, ind = index.search(query, k) # 真实查询
44 | print(dis)
45 | print(ind)
46 | 
--------------------------------------------------------------------------------
/Faiss的使用/004-faiss实现kmeans聚类.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 004-faiss实现kmeans聚类.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 | 
10 | 
11 | if __name__ == '__main__':
12 | # 数据
13 | n_data, d = 2000, 512
14 | np.random.seed(43)
15 | data = []
16 | mu, sigma = 3, 0.1
17 | for i in range(n_data):
18 | data.append(np.random.normal(mu, sigma, d))
19 | data = np.array(data).astype('float32')
20 | 
21 | # 聚类
22 | n_centroids = 1024 # 聚类中心个数
23 | d = data.shape[1]
24 | kmeans = faiss.Kmeans(d, n_centroids)
25 | kmeans.train(data)
26 | # 输出聚类中心
27 | # print(kmeans.centroids)
28 | # print(len(kmeans.centroids))
29 | 
30 | # 看data中的前五个向量属于哪个类(最有可能的两个类)
31 | D, I = kmeans.index.search(data[:5], k=2)
32 | print(D) # 与每个类的距离
33 | print(I) # 类的编号
34 | """
35 | 输出:
36 | [[4.1553707 5.2924204]
37 | [1.9329664 4.930997 ]
38 | [4.537619 4.8509283]
39 | [4.6700296 5.2252126]
40 | [2.101182 4.9292693]]
41 | [[478 568]
42 | [767 697]
43 | [568 527]
44 | [999 568]
45 | [175 853]]
46 | """
47 | 
48 | print('*'*100)
49 | # 计算每个中心最近的若干条向量
50 | k = 5
51 | index = faiss.IndexFlatL2(d)
52 | index.add(data)
53 | D, I = index.search(kmeans.centroids, k)
54 | print(D)
55 | print(I)
56 | 
57 | 
58 | 
59 | 
--------------------------------------------------------------------------------
/Faiss的使用/005-faiss实现pca降维.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 005-faiss实现pca降维.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-08-09
6 | """
7 | import faiss
8 | import numpy as np
9 | 
10 | 
11 | if __name__ == '__main__':
12 | # 数据
13 | n_data, d = 2000, 512
14 | np.random.seed(43)
15 | data = []
16 | mu, sigma = 3, 0.1
17 | for i in range(n_data):
18 | data.append(np.random.normal(mu, sigma, d))
19 | data = np.array(data).astype('float32')
20 | 
21 | mat = faiss.PCAMatrix(512, 64) # 从512维降为64维
22 | mat.train(data)
23 | assert mat.is_trained
24 | tr = mat.apply_py(data)
25 | print(tr.shape)
26 | print(tr)
27 | 
28 | 
--------------------------------------------------------------------------------
/Faiss的使用/006-faiss实现PQ编码和解码.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 006-faiss实现PQ编码和解码.py
3 | @author : xiaolu
4 | @email : 
luxiaonlp@163.com 5 | @time : 2021-08-09 6 | """ 7 | import faiss 8 | import numpy as np 9 | 10 | 11 | if __name__ == '__main__': 12 | # 数据 13 | n_data, d = 2000, 512 14 | np.random.seed(43) 15 | data = [] 16 | mu, sigma = 3, 0.1 17 | for i in range(n_data): 18 | data.append(np.random.normal(mu, sigma, d)) 19 | data = np.array(data).astype('float32') 20 | 21 | cs = 4 # code size (bytes) 22 | # 训练数据集 23 | x = data # 原始的数据集 24 | 25 | x_train = data # 训练集 26 | pq = faiss.ProductQuantizer(d, cs, 8) 27 | pq.train(x_train) 28 | 29 | # encode编码 30 | codes = pq.compute_codes(x) 31 | 32 | # decode解码 33 | x2 = pq.decode(codes) 34 | 35 | # 编码-解码后与原始数据的差 36 | avg_relative_error = ((x - x2)**2).sum() / (x ** 2).sum() 37 | print(avg_relative_error) -------------------------------------------------------------------------------- /Faiss的使用/007-faiss实现标量量化器.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 007-faiss实现标量量化器.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-08-09 6 | """ 7 | import faiss 8 | import numpy as np 9 | 10 | 11 | if __name__ == '__main__': 12 | # 数据 13 | n_data, d = 2000, 512 14 | np.random.seed(43) 15 | data = [] 16 | mu, sigma = 3, 0.1 17 | for i in range(n_data): 18 | data.append(np.random.normal(mu, sigma, d)) 19 | data = np.array(data).astype('float32') 20 | 21 | x = data 22 | # 训练集 23 | x_train = data 24 | # QT_8bit allocates 8 bits per dimension (QT_4bit also works) 25 | sq = faiss.ScalarQuantizer(d, faiss.ScalarQuantizer.QT_8bit) 26 | sq.train(x_train) 27 | 28 | # encode 编码 29 | codes = sq.compute_codes(x) 30 | 31 | # decode 解码 32 | x2 = sq.decode(codes) 33 | 34 | # 计算编码-解码后与原始数据的差 35 | avg_relative_error = ((x - x2)**2).sum() / (x ** 2).sum() 36 | print(avg_relative_error) -------------------------------------------------------------------------------- /Faiss的使用/008-faiss_use_gpu.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 008-faiss_use_gpu.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-08-25 6 | """ 7 | import faiss 8 | import numpy as np 9 | import time 10 | 11 | 12 | 13 | if __name__ == '__main__': 14 | d = 512 # 向量维度 15 | nb = 300000 # 向量库的大小 16 | nq = 100 # 用这100个向量进行检索 17 | 18 | np.random.seed(1234) 19 | 20 | # 随机产生一个向量库 21 | xb = np.random.random((nb,d)).astype('float32') 22 | xb[:, 0] += np.arange(nb) / 1000. 23 | 24 | # 随机产生100个query向量 25 | xq = np.random.random((nq,d)).astype('float32') 26 | xq[:, 0] += np.arange(nq) / 1000. 
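    # ---- 编辑者补充(假设性示例, 非原仓库内容) ----
    # 若只想用单张GPU, 可用 StandardGpuResources 配合 index_cpu_to_gpu:
    #   res = faiss.StandardGpuResources()
    #   gpu_index_single = faiss.index_cpu_to_gpu(res, 0, index)  # 第二个参数0为GPU编号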
27 | 
28 | quantizer = faiss.IndexFlatL2(d)
29 | nlist = 100 # 将数据库向量分割为多少个维诺空间(Voronoi单元)
30 | index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
31 | 
32 | gpu_index = faiss.index_cpu_to_all_gpus(index) # 使用gpu只需这一行代码
33 | print(gpu_index.is_trained)
34 | gpu_index.train(xb)
35 | print(gpu_index.is_trained)
36 | 
37 | gpu_index.add(xb)
38 | gpu_index.nprobe = 10 # 选择10个维诺空间进行检索
39 | k = 10 # 返回十个结果
40 | D, gt_nms = gpu_index.search(xq, k)
41 | print(gt_nms)
--------------------------------------------------------------------------------
/LAC分词器/001-分词.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/3 14:06
4 | @Auth : xiaolu
5 | @File :001-分词.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from LAC import LAC
10 | import jieba
11 | 
12 | 
13 | if __name__ == '__main__':
14 | lac = LAC(mode='seg')
15 | 
16 | # 单个样本输入, 输入为unicode编码的字符串
17 | text = '大王叫我来巡山'
18 | lac_result = lac.run(text)
19 | print(lac_result)
20 | 
21 | jieba_result = jieba.lcut(text)
22 | print(jieba_result)
23 | 
24 | # 批量样本输入, 输入为多个句子组成的list,平均速率会更快
25 | texts = ["山里有个庙", "庙里有个老和尚跟一个小和尚"]
26 | result = lac.run(texts)
27 | print(result)
28 | 
29 | 
30 | 
31 | 
--------------------------------------------------------------------------------
/LAC分词器/002-词性标注和实体识别.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/3 14:09
4 | @Auth : xiaolu
5 | @File :002-词性标注和实体识别.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from LAC import LAC
10 | 
11 | 
12 | if __name__ == '__main__':
13 | lac = LAC(mode='lac')
14 | text = '我想涨工资'
15 | 
16 | lac_result = lac.run(text)
17 | print(lac_result)
18 | 
19 | texts = ["汤青松长得好帅", "我喜欢做安全开发工程师"]
20 | lac_result = lac.run(texts)
21 | print(lac_result)
22 | 
23 | 
24 | 
25 | 
26 | 
27 | 
28 | 
--------------------------------------------------------------------------------
/LAC分词器/003-加载自己的词表进行分词.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2020/11/6 11:18
4 | @Auth : xiaolu
5 | @File :003-加载自己的词表进行分词.py
6 | @IDE :PyCharm
7 | @Email:luxiaonlp@163.com
8 | """
9 | from LAC import LAC
10 | import jieba
11 | 
12 | if __name__ == '__main__':
13 | lac = LAC()
14 | lac.load_customization('./vocab.txt', sep=None)
15 | res1 = lac.run('字节跳动阿里巴巴腾讯公司金山软件小米科技')
16 | res2 = jieba.lcut('字节跳动阿里巴巴腾讯公司金山软件小米科技')
17 | print(res1)
18 | print(res2)
19 | 
20 | 
21 | 
--------------------------------------------------------------------------------
/LAC分词器/vocab.txt:
--------------------------------------------------------------------------------
1 | 我
2 | 爱你
3 | 我爱
--------------------------------------------------------------------------------
/PySpark/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/PySpark/.DS_Store
--------------------------------------------------------------------------------
/PySpark/001-data_processing_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 001-data_processing_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 | from pyspark.sql import SparkSession
10 | from pyspark.sql.types import StringType, DoubleType, IntegerType
11 | from pyspark.sql.functions import udf
12 | from pyspark.sql.functions import pandas_udf, PandasUDFType
13 | 
14 | 
15 | def price_range(brand):
16 | if brand in ['Samsung', 'Apple']:
17 | return 'High Price'
18 | elif brand == 'MI':
19 | return 'Mid Price'
20 | else:
21 | return 'Low Price'
22 | 
23 | 
24 | def remaining_yrs(age):
25 | yrs_left = 100-age
26 | return yrs_left
27 | 
28 | 
29 | if __name__ == '__main__':
30 | # 1. 创建会话对象
31 | spark = SparkSession.builder.appName('data_processing').getOrCreate()
32 | 
33 | # 2. 加载数据
34 | df = spark.read.csv('./data/sample_data.csv', inferSchema=True, header=True)
35 | print(df.columns) # 打印所有特征名: ['ratings', 'age', 'experience', 'family', 'mobile']
36 | print(df.count()) # 总的数据量: 33
37 | 
38 | # 打印数据格式
39 | print(df.printSchema())
40 | 
41 | # 打印前五条数据
42 | print(df.show(n=5))
43 | 
44 | # 打印某两列的前三条数据
45 | print(df.select('ratings', 'mobile').show(n=3))
46 | 
47 | # 打印数据统计量 也就是每个特征的均值、方差等。
48 | print(df.describe().show())
49 | 
50 | # 新建一列数据
51 | print(df.withColumn("age_after_10_yrs", (df["age"]+10)).show(5))
52 | 
53 | # 将某列数据转换类型 变成新的一列数据
54 | print(df.withColumn('age_double', df['age'].cast(DoubleType())).show(3, False))
55 | 
56 | # 过滤: 指定某个属性的取值,找出该属性取该值的全部数据
57 | print(df.filter(df['mobile'] == 'Vivo').select('age', 'ratings', 'mobile').show())
58 | 
59 | # 多条件过滤
60 | print(df.filter((df['mobile'] == 'Vivo') & (df['experience'] > 10)).show())
61 | 
62 | # 将某个特征下的值去重后,然后显示出来
63 | print(df.select('mobile').distinct().show())
64 | print('去重后的取值数:', df.select('mobile').distinct().count())
65 | 
66 | # 根据某个特征的取值进行分组
67 | print(df.groupBy('mobile').count().show()) # 分组统计个数
68 | print(df.groupBy('mobile').mean().show()) # 分组后 计算每个特征的均值
69 | print(df.groupBy('mobile').sum().show()) # 分组后 计算每个特征的和
70 | print(df.groupBy('mobile').agg({'experience': 'sum'}).show()) # 分组后,只对experience特征求和
71 | print(df.groupBy('mobile').max().show()) # 分组后 计算每个特征的最大值
72 | print(df.groupBy('mobile').min().show()) # 分组后 计算每个特征的最小值
73 | 
74 | # 普通UDF
75 | # 用户自定义数据函数UDF
76 | brand_udf = udf(price_range, StringType()) # 两个参数: 用户自定义的函数, 返回值的数据类型
77 | print(df.withColumn('price_range', brand_udf(df['mobile'])).show()) # 将udf应用在mobile特征上
78 | 
79 | # 或者采用lambda表达式
80 | age_udf = udf(lambda age: "young" if age <= 30 else "senior", StringType())
81 | print(df.withColumn("age_group", age_udf(df.age)).show())
82 | 
83 | # 去掉重复的记录
84 | print(df.count())
85 | df = df.dropDuplicates()
86 | print('去掉重复记录后的数据数:', df.count())
87 | 
88 | # 删除某列
89 | df_new = df.drop('mobile')
90 | print(df_new.show(5))
91 | 
92 | 
93 | 
94 | 
95 | 
96 | 
97 | 
98 | 
99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
--------------------------------------------------------------------------------
/PySpark/002-linear_regression_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 002-linear_regression_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 | 
10 | from pyspark.sql import SparkSession
11 | from pyspark.sql.functions import corr
12 | from pyspark.ml.linalg import Vector
13 | from pyspark.ml.feature import VectorAssembler
14 | from pyspark.ml.regression import LinearRegression
15 | 
16 | 
17 | def analyse_data(df):
18 | '''
19 | 数据分析
20 | :param df:
21 | :return:
22 | '''
23 | # 打印数据格式
24 | print(df.printSchema())
25 | 
26 | # 打印前十条数据
27 | print(df.head(10))
28 | 
29 | # 看某个特征与输出的相关系数 var_1与output的相关系数
30 | print(df.select(corr('var_1', 'output')).show()) # 0.9187399607627283
31 | 
32 | 
33 | def feature_process(df):
34 | '''
35 | 特征工程
36 | :param df:
37 | :return:
38 | '''
39 | # 将var_1到var_5合成一个向量,名字叫做features
40 | vec_assembler = VectorAssembler(inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'], outputCol='features')
41 | features_df = vec_assembler.transform(df)
42 | # print(features_df.select('features').show(5)) # 看features的取值
43 | 
44 | model_df = features_df.select('features', 'output') # 将features和输出拿出来 进行模型训练
45 | # print(model_df.show(5))
46 | return model_df
47 | 
48 | 
49 | if __name__ == '__main__':
50 | # 1. 加载数据集
51 | spark = SparkSession.builder.appName('lin_reg').getOrCreate()
52 | df = spark.read.csv('./data/Linear_regression_dataset.csv', inferSchema=True, header=True)
53 | # print('数据量:{}, 特征数:{}'.format(df.count(), len(df.columns))) # 数据量:1232, 特征数:6
54 | 
55 | # 2. 数据分析
56 | # analyse_data(df) 如果进行数据分析 执行该函数
57 | 
58 | # 3. 特征工程
59 | model_df = feature_process(df) # 将各个特征的值合并成一个向量
60 | # 划分数据
61 | train_df, test_df = model_df.randomSplit([0.7, 0.3])
62 | # print('训练集---数据量:{}, 特征数:{}'.format(train_df.count(), len(train_df.columns))) # 数据量:868, 特征数:2
63 | # print('测试集---数据量:{}, 特征数:{}'.format(test_df.count(), len(test_df.columns))) # 数据量:364, 特征数:2
64 | 
65 | # 4. 模型训练
66 | lin_Reg = LinearRegression(labelCol='output')
67 | lr_model = lin_Reg.fit(train_df)
68 | 
69 | # 5. 模型评价
70 | # 模型训练完毕 打印回归系数
71 | print(lr_model.coefficients)
72 | 
73 | training_predictions = lr_model.evaluate(train_df)
74 | print('训练集的均方误差:', training_predictions.meanSquaredError)
75 | # 训练集的均方误差: 0.00014265219879599827
76 | 
77 | testing_predictions = lr_model.evaluate(test_df)
78 | print('测试集的均方误差:', testing_predictions.meanSquaredError)
79 | # 测试集的均方误差: 0.00014983739298532136
80 | 
81 | 
82 | 
83 | 
84 | 
85 | 
86 | 
87 | 
88 | 
89 | 
90 | 
91 | 
92 | 
93 | 
94 | 
95 | 
96 | 
97 | 
98 | 
99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
--------------------------------------------------------------------------------
/PySpark/003-logistic_regression_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 003-logistic_regression_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 | 
10 | from pyspark.sql import SparkSession
11 | from pyspark.ml.feature import StringIndexer
12 | from pyspark.ml.feature import VectorAssembler
13 | from pyspark.ml.feature import OneHotEncoder
14 | from pyspark.ml.classification import LogisticRegression
15 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
16 | 
17 | 
18 | def analyse_data(df):
19 | '''
20 | 数据分析
21 | :param df:
22 | :return:
23 | '''
24 | # 打印数据的格式
25 | print(df.printSchema())
26 | 
27 | # 打印前五条数据
28 | print(df.show(n=5))
29 | 
30 | # 简单看一下各个特征的统计指标
31 | print(df.describe().show()) # 对于离散值 是不计算均值和方差的
32 | 
33 | # 按国家特征进行聚合 看看哪个国家样本多
34 | print(df.groupby('Country').count().show())
35 | 
36 | # 看看哪个搜索引擎的用户数量最高
37 | print(df.groupby('Platform').count().show())
38 | 
39 | 
40 | def feature_process(df):
41 | '''
42 | 特征工程
43 | :param df:
44 | :return:
45 | '''
46 | # 这里需要将国家和搜索引擎两个特征转为数值特征
47 | search_engine_indexer = StringIndexer(inputCol="Platform", outputCol='Platform_Num').fit(df)
48 | df = search_engine_indexer.transform(df)
49 | # print(df.show(3))
50 | search_engine_encoder = OneHotEncoder(inputCol='Platform_Num', outputCol='Platform_Num_Vec').fit(df)
51 | df = search_engine_encoder.transform(df)
52 | # print(df.show(3))
53 | 
54 | # print('*'*150)
55 | # 然后处理国家特征
56 | country_indexer = StringIndexer(inputCol="Country", outputCol='Country_Num').fit(df)
57 | df = country_indexer.transform(df)
58 | # print(df.show(3))
59 | country_encoder = OneHotEncoder(inputCol='Country_Num', outputCol='Country_Num_Vec').fit(df)
60 | df = country_encoder.transform(df)
61 | # print(df.show(3))
62 | 
63 | df_assembler = VectorAssembler(
64 | inputCols=['Platform_Num_Vec', 'Country_Num_Vec', 'Age', 'Repeat_Visitor', 'Web_pages_viewed'],
65 | outputCol='features'
66 | )
67 | df = df_assembler.transform(df)
68 | model_df = df.select(['features', 'Status'])
69 | return model_df
70 | 
71 | 
72 | if __name__ == "__main__":
73 | # 1. 加载数据
74 | spark = SparkSession.builder.appName('log_reg').getOrCreate()
75 | df = spark.read.csv('./data/Log_Reg_dataset.csv', inferSchema=True, header=True)
76 | # print('样本数:{}, 特征数:{}'.format(df.count(), len(df.columns))) # 样本数:20000, 特征数:6
77 | 
78 | # 2. 数据分析
79 | # analyse_data(df)
80 | 
81 | # 3. 特征工程
82 | model_df = feature_process(df)
83 | # print(model_df.show(3))
84 | # 切分数据集
85 | training_df, test_df = model_df.randomSplit([0.75, 0.25])
86 | print('训练集的个数:', training_df.count())
87 | print('测试集的个数:', test_df.count())
88 | 
89 | print('训练集的正负样本比例:')
90 | print(training_df.groupBy('Status').count().show())
91 | 
92 | print('测试集的正负样本比例:')
93 | print(test_df.groupBy('Status').count().show())
94 | 
95 | # 4. 训练模型
96 | log_reg = LogisticRegression(labelCol='Status').fit(training_df)
97 | 
98 | # 5. 测试模型
99 | train_results = log_reg.evaluate(training_df).predictions
100 | correct_preds = train_results.filter(train_results['Status'] == 1).filter(train_results['prediction'] == 1).count()
101 | print('训练集正类(Status=1)的召回率:', float(correct_preds)/(training_df.filter(training_df['Status'] == 1).count()))
102 | 
103 | # 在测试集上的表现
104 | results = log_reg.evaluate(test_df).predictions
105 | # 计算混淆矩阵
106 | true_postives = results[(results.Status == 1) & (results.prediction == 1)].count()
107 | true_negatives = results[(results.Status == 0) & (results.prediction == 0)].count()
108 | false_positives = results[(results.Status == 0) & (results.prediction == 1)].count()
109 | false_negatives = results[(results.Status == 1) & (results.prediction == 0)].count()
110 | recall = float(true_postives)/(true_postives + false_negatives)
111 | print('召回率:', recall)
112 | 
113 | precision = float(true_postives) / (true_postives + false_positives)
114 | print('精确率:', precision)
115 | 
116 | accuracy = float((true_postives+true_negatives) /(results.count()))
117 | print('准确率:', accuracy)
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
--------------------------------------------------------------------------------
/PySpark/004-random_forests_classification_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 004-random_forests_classification_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | 
9 | findspark.init()
10 | 
11 | from pyspark.ml.feature import VectorAssembler
12 | from pyspark.sql import SparkSession
13 | from pyspark.ml.classification import RandomForestClassifier
14 | from pyspark.ml.classification import RandomForestClassificationModel
15 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
16 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator
17 | 
18 | 
19 | def analyse_data(df):
20 | '''
21 | 数据分析
22 | :param df:
23 | :return:
24 | '''
25 | print(df.show(5))
26 | 
27 | # 看看每个特征的统计信息 如均值方差等
28 | print(df.describe().select('summary', 'rate_marriage', 'age', 'yrs_married', 'children', 'religious').show())
29 | 
30 | # 人们对婚姻打分比例
31 | print(df.groupBy('rate_marriage').count().show())
32 | 
33 | # 以孩子数和婚外情(affairs)为键 然后聚合。可以发现数据集中没孩子、无婚外情的人最多
34 | print(df.groupBy('children', 'affairs').count().orderBy('children', 'affairs', 'count', ascending=True).show())
35 | 
36 | 
37 | def feature_process(df):
38 | '''
39 | 特征工程
40 | :param df:
41 | :return:
42 | '''
43 | df_assembler = VectorAssembler(inputCols=['rate_marriage', 'age', 'yrs_married', 'children', 'religious'],
44 | outputCol="features")
45 | df = df_assembler.transform(df)
46 | model_df = df.select(['features', 'affairs'])
47 | return model_df
48 | 
49 | 
50 | if __name__ == '__main__':
51 | # 1. 加载数据集
52 | spark = SparkSession.builder.appName('random_forest').getOrCreate()
53 | df = spark.read.csv('./data/affairs.csv', inferSchema=True, header=True)
54 | print((df.count(), len(df.columns)))
55 | 
56 | # 2. 数据分析
57 | analyse_data(df)
58 | 
59 | # 3. 特征工程
60 | model_df = feature_process(df)
61 | # 切分数据集
62 | train_df, test_df = model_df.randomSplit([0.75, 0.25])
63 | print('训练集条数:', train_df.count())
64 | print('训练集标签的统计:')
65 | print(train_df.groupBy('affairs').count().show())
66 | 
67 | print('测试集条数:', test_df.count())
68 | print('测试集标签的统计:')
69 | print(test_df.groupBy('affairs').count().show())
70 | 
71 | # 4. 训练模型
72 | rf_classifier = RandomForestClassifier(labelCol='affairs', numTrees=50).fit(train_df)
73 | 
74 | # 5. 模型评估
75 | rf_predictions = rf_classifier.transform(test_df)
76 | 
77 | rf_accuracy = MulticlassClassificationEvaluator(labelCol='affairs', metricName='accuracy').evaluate(rf_predictions)
78 | print('测试集的准确率:', rf_accuracy)
79 | 
80 | rf_precision = MulticlassClassificationEvaluator(labelCol='affairs', metricName='weightedPrecision').evaluate(
81 | rf_predictions)
82 | print('测试集的精确率:', rf_precision)
83 | 
84 | rf_auc = BinaryClassificationEvaluator(labelCol='affairs').evaluate(rf_predictions)
85 | print('测试集的AUC值:', rf_auc)
86 | 
87 | # 看一下在分类中 每个特征所起的重要性
88 | print(rf_classifier.featureImportances)
89 | 
90 | # 保存模型
91 | rf_classifier.save("./RF_model")
92 | 
93 | # 下次使用, 则按照下面的方式加载
94 | rf = RandomForestClassificationModel.load("./RF_model")
95 | model_predictions = rf.transform(test_df)
96 | model_predictions.show()
97 | 
98 | 
--------------------------------------------------------------------------------
/PySpark/005-kmeans_cluster_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 005-kmeans_cluster_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-08
6 | """
7 | import findspark
8 | findspark.init()
9 | import pyspark
10 | import pandas as pd
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 | from pyspark.sql.functions import *
14 | from pyspark.sql.types import *
15 | from pyspark.sql.functions import rand, randn
16 | from pyspark.ml.clustering import KMeans
17 | from pyspark.sql import SparkSession
18 | from pyspark.ml.linalg import Vectors
19 | from pyspark.ml.feature import VectorAssembler
20 | from pyspark.ml.evaluation import ClusteringEvaluator
21 | 
22 | def analyse_data(df):
23 | '''
24 | 数据分析
25 | :param df:
26 | :return:
27 | '''
28 | print('总共的标签数:', df.select('species').distinct().count())
29 | 
30 | # 每类数据集的样本数
31 | print(df.groupBy('species').count().orderBy('count', ascending=False).show())
32 | 
33 | 
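def plot_silhouette(ks, scores):
    '''
    编辑者补充的辅助函数(假设性示例, 非原仓库内容):
    画出各个k对应的轮廓系数, 便于用肘部法选择簇数
    用法: plot_silhouette(list(range(2, 10)), errors)
    '''
    plt.plot(ks, scores, 'o-')
    plt.xlabel('k')
    plt.ylabel('silhouette score')
    plt.show()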
34 | def feature_process(df):
35 | '''
36 | 特征工程
37 | :param df:
38 | :return:
39 | '''
40 | input_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
41 | vec_assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
42 | final_data = vec_assembler.transform(df)
43 | return final_data
44 | 
45 | 
46 | if __name__ == '__main__':
47 | # 加载鸢尾花的数据
48 | spark = SparkSession.builder.appName('k_means').getOrCreate()
49 | df = spark.read.csv('./data/iris_dataset.csv',inferSchema=True,header=True)
50 | print((df.count(),len(df.columns)))
51 | 
52 | analyse_data(df)
53 | 
54 | final_data = feature_process(df)
55 | 
56 | errors = []
57 | 
58 | for k in range(2, 10):
59 | kmeans = KMeans(featuresCol='features', k=k)
60 | model = kmeans.fit(final_data)
61 | 
62 | # Make predictions
63 | predictions = model.transform(final_data)
64 | evaluator = ClusteringEvaluator()
65 | silhouette = evaluator.evaluate(predictions) # 轮廓系数(默认基于平方欧式距离)
66 | errors.append(silhouette) # 收集各个k的得分, 便于事后比较
67 | print('k =', k, ', silhouette =', silhouette)
68 | 
69 | # 打印聚类的中心
70 | centers = model.clusterCenters()
71 | print("Cluster Centers: ")
72 | for center in centers:
73 | print(center)
74 | 
75 | 
76 | 
77 | 
78 | 
--------------------------------------------------------------------------------
/PySpark/006-recommendr_system_use_pyspark.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : 006-recommendr_system_use_pyspark.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-04-09
6 | """
7 | import findspark
8 | 
9 | findspark.init()
10 | 
11 | from pyspark.sql import SparkSession
12 | from pyspark.sql.functions import rand
13 | from pyspark.ml.feature import StringIndexer, IndexToString
14 | from pyspark.ml.recommendation import ALS
15 | from pyspark.ml.evaluation import RegressionEvaluator
16 | 
17 | 
18 | def analyse_data(df):
19 | '''
20 | 数据分析
21 | :param df:
22 | :return:
23 | '''
24 | print(df.printSchema()) # 查看数据格式
25 | 
26 | # 看前5条数据
27 | print(df.show(5))
28 | 
29 | print(df.orderBy(rand()).show(5)) # 将数据打乱 看前五条
30 | 
31 | # 与用户进行聚合,看每个用户都看过多少电影 前五名最爱看电影的人
32 | print(df.groupBy('userId').count().orderBy('count', ascending=False).show(5))
33 | 
34 | # 显示前五个最热门的电影
35 | print(df.groupBy('title').count().orderBy('count', ascending=False).show(5))
36 | 
37 | 
38 | def feature_process(df):
39 | '''
40 | 特征工程
41 | :param df:
42 | :return:
43 | '''
44 | # 1. 将title转为数字 也就是多加了一列特征
45 | stringIndexer = StringIndexer(inputCol="title", outputCol="title_new")
46 | model = stringIndexer.fit(df)
47 | indexed = model.transform(df)
48 | print(indexed.show(5))
49 | return indexed
50 | 
51 | 
52 | if __name__ == '__main__':
53 | # 1. 加载数据
54 | spark = SparkSession.builder.appName('rc').getOrCreate()
55 | df = spark.read.csv('./data/movie_ratings_df.csv', inferSchema=True, header=True)
56 | # print((df.count(), len(df.columns))) # (100000, 3)
57 | 
58 | # 2. 数据分析
59 | analyse_data(df)
60 | 
61 | # 3. 特征工程
62 | model_df = feature_process(df)
63 | # 切分数据集
64 | train, test = model_df.randomSplit([0.75, 0.25])
65 | print('训练集条数:', train.count())
66 | print('测试集条数:', test.count())
67 | # 训练集条数: 74996
68 | # 测试集条数: 25004
69 | 
70 | # 4. 模型训练
71 | rec = ALS(maxIter=10, regParam=0.01, userCol='userId',
72 | itemCol='title_new', ratingCol='rating',
73 | nonnegative=True, coldStartStrategy="drop")
74 | rec_model = rec.fit(train)
75 | 
76 | # 5. 
模型评估 77 | predicted_ratings = rec_model.transform(test) 78 | print(predicted_ratings.printSchema()) 79 | 80 | # 计算预测和rating的均方误差 81 | evaluator=RegressionEvaluator(metricName='rmse',predictionCol='prediction',labelCol='rating') 82 | rmse=evaluator.evaluate(predicted_ratings) 83 | print(rmse) 84 | 85 | -------------------------------------------------------------------------------- /PySpark/007-NLP_use_pyspark.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 007-NLP_use_pyspark.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-04-09 6 | """ 7 | import findspark 8 | 9 | findspark.init() 10 | 11 | from pyspark.sql import SparkSession 12 | from pyspark.ml.feature import Tokenizer 13 | from pyspark.ml.feature import StopWordsRemover 14 | from pyspark.ml.feature import CountVectorizer 15 | from pyspark.ml.feature import HashingTF, IDF 16 | from pyspark.sql.functions import length 17 | from pyspark.sql.functions import udf 18 | from pyspark.sql.types import IntegerType 19 | from pyspark.sql.functions import * 20 | from pyspark.ml.feature import VectorAssembler 21 | from pyspark.ml.classification import LogisticRegression 22 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 23 | 24 | 25 | def basic_op(): 26 | ''' 27 | 基本的操作 28 | :return: 29 | ''' 30 | spark = SparkSession.builder.appName('nlp').getOrCreate() 31 | df = spark.createDataFrame([(1, 'I really liked this movie'), 32 | (2, 'I would recommend this movie to my friends'), 33 | (3, 'movie was alright but acting was horrible'), 34 | (4, 'I am never watching that movie ever again')], 35 | ['user_id', 'review']) 36 | # print(df.show()) 37 | 38 | # 1. 将文本进行分词 做成新一个特征 39 | tokenization = Tokenizer(inputCol='review', outputCol='tokens') 40 | tokenized_df = tokenization.transform(df) 41 | # print(tokenized_df.show()) 42 | 43 | # 2. 去除停用词 44 | stopword_removal = StopWordsRemover(inputCol='tokens', outputCol='refined_tokens') 45 | refined_df = stopword_removal.transform(tokenized_df) 46 | print(refined_df.select(['user_id', 'tokens', 'refined_tokens']).show(10)) 47 | 48 | # 3. 统计向量 使用one-hot 49 | count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features') 50 | cv_df = count_vec.fit(refined_df).transform(refined_df) 51 | print(cv_df.select(['user_id', 'refined_tokens', 'features']).show(4)) 52 | print('词表(注:去停用词之后的):', count_vec.fit(refined_df).vocabulary) 53 | 54 | # 4. 
计算tf-idf
55 | hashing_vec = HashingTF(inputCol='refined_tokens', outputCol='tf_features')
56 | hashing_df = hashing_vec.transform(refined_df) # 先进行一个hash计算
57 | print(hashing_df.select(['user_id', 'refined_tokens', 'tf_features']).show())
58 | 
59 | tf_idf_vec = IDF(inputCol='tf_features', outputCol='tf_idf_features')
60 | tf_idf_df = tf_idf_vec.fit(hashing_df).transform(hashing_df)
61 | print(tf_idf_df.select(['user_id', 'tf_idf_features']).show(4))
62 | 
63 | 
64 | def data_process(text_df):
65 | text_df = text_df.filter(((text_df.Sentiment == '1') | (text_df.Sentiment == '0')))
66 | print('清洗后的数据量:', text_df.count())
67 | 
68 | print('正负样本的分布')
69 | print(text_df.groupBy('Sentiment').count().show())
70 | 
71 | # 将Sentiment转成float类型的Label列
72 | text_df = text_df.withColumn("Label", text_df.Sentiment.cast('float')).drop('Sentiment')
73 | 
74 | # 加入评论长度特征, 然后分词
75 | text_df = text_df.withColumn('length', length(text_df['Review']))
76 | tokenization = Tokenizer(inputCol='Review', outputCol='tokens')
77 | tokenized_df = tokenization.transform(text_df)
78 | 
79 | # 去停用词
80 | stopword_removal = StopWordsRemover(inputCol='tokens', outputCol='refined_tokens')
81 | refined_text_df = stopword_removal.transform(tokenized_df)
82 | 
83 | len_udf = udf(lambda s: len(s), IntegerType())
84 | refined_text_df = refined_text_df.withColumn("token_count", len_udf(col('refined_tokens')))
85 | 
86 | count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features')
87 | cv_text_df = count_vec.fit(refined_text_df).transform(refined_text_df)
88 | 
89 | model_text_df = cv_text_df.select(['features', 'token_count', 'Label'])
90 | return model_text_df
91 | 
92 | 
93 | if __name__ == '__main__':
94 | # basic_op()
95 | 
96 | # 下面做一个简单的文本分类
97 | spark = SparkSession.builder.appName('text_classification').getOrCreate()
98 | text_df = spark.read.csv('./data/Movie_reviews.csv', inferSchema=True, header=True, sep=',')
99 | print('数据量:', text_df.count()) # 数据量: 7087
100 | 
101 | model_text_df = data_process(text_df)
102 | df_assembler = VectorAssembler(inputCols=['features', 'token_count'], outputCol='features_vec')
103 | model_text_df = df_assembler.transform(model_text_df)
104 | 
105 | # 切分数据集
106 | training_df, test_df = model_text_df.randomSplit([0.75, 0.25])
107 | 
108 | # 模型训练
109 | log_reg = LogisticRegression(featuresCol='features_vec', labelCol='Label').fit(training_df)
110 | 
111 | # 模型评估
112 | results = log_reg.evaluate(test_df).predictions
113 | 
114 | # confusion matrix
115 | true_postives = results[(results.Label == 1) & (results.prediction == 1)].count()
116 | true_negatives = results[(results.Label == 0) & (results.prediction == 0)].count()
117 | false_positives = results[(results.Label == 0) & (results.prediction == 1)].count()
118 | false_negatives = results[(results.Label == 1) & (results.prediction == 0)].count()
119 | 
120 | recall = float(true_postives) / (true_postives + false_negatives)
121 | print(recall) # 召回率
122 | 
123 | precision = float(true_postives) / (true_postives + false_positives)
124 | print(precision) # 精确率
125 | 
126 | accuracy = float((true_postives + true_negatives) / (results.count()))
127 | print(accuracy) # 准确率
128 | 
--------------------------------------------------------------------------------
/PySpark/data/Movie_reviews.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/PySpark/data/Movie_reviews.csv
--------------------------------------------------------------------------------
/PySpark/data/iris_dataset.csv: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width,species 2 | 5.1,3.5,1.4,0.2,setosa 3 | 4.9,3,1.4,0.2,setosa 4 | 4.7,3.2,1.3,0.2,setosa 5 | 4.6,3.1,1.5,0.2,setosa 6 | 5,3.6,1.4,0.2,setosa 7 | 5.4,3.9,1.7,0.4,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 5,3.4,1.5,0.2,setosa 10 | 4.4,2.9,1.4,0.2,setosa 11 | 4.9,3.1,1.5,0.1,setosa 12 | 5.4,3.7,1.5,0.2,setosa 13 | 4.8,3.4,1.6,0.2,setosa 14 | 4.8,3,1.4,0.1,setosa 15 | 4.3,3,1.1,0.1,setosa 16 | 5.8,4,1.2,0.2,setosa 17 | 5.7,4.4,1.5,0.4,setosa 18 | 5.4,3.9,1.3,0.4,setosa 19 | 5.1,3.5,1.4,0.3,setosa 20 | 5.7,3.8,1.7,0.3,setosa 21 | 5.1,3.8,1.5,0.3,setosa 22 | 5.4,3.4,1.7,0.2,setosa 23 | 5.1,3.7,1.5,0.4,setosa 24 | 4.6,3.6,1,0.2,setosa 25 | 5.1,3.3,1.7,0.5,setosa 26 | 4.8,3.4,1.9,0.2,setosa 27 | 5,3,1.6,0.2,setosa 28 | 5,3.4,1.6,0.4,setosa 29 | 5.2,3.5,1.5,0.2,setosa 30 | 5.2,3.4,1.4,0.2,setosa 31 | 4.7,3.2,1.6,0.2,setosa 32 | 4.8,3.1,1.6,0.2,setosa 33 | 5.4,3.4,1.5,0.4,setosa 34 | 5.2,4.1,1.5,0.1,setosa 35 | 5.5,4.2,1.4,0.2,setosa 36 | 4.9,3.1,1.5,0.1,setosa 37 | 5,3.2,1.2,0.2,setosa 38 | 5.5,3.5,1.3,0.2,setosa 39 | 4.9,3.1,1.5,0.1,setosa 40 | 4.4,3,1.3,0.2,setosa 41 | 5.1,3.4,1.5,0.2,setosa 42 | 5,3.5,1.3,0.3,setosa 43 | 4.5,2.3,1.3,0.3,setosa 44 | 4.4,3.2,1.3,0.2,setosa 45 | 5,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | 5.3,3.7,1.5,0.2,setosa 51 | 5,3.3,1.4,0.2,setosa 52 | 7,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,3.1,4.9,1.5,versicolor 55 | 5.5,2.3,4,1.3,versicolor 56 | 6.5,2.8,4.6,1.5,versicolor 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 4.9,2.4,3.3,1,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5,2,3.5,1,versicolor 63 | 5.9,3,4.2,1.5,versicolor 64 | 6,2.2,4,1,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4,1.3,versicolor 74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3,5,1.7,versicolor 80 | 6,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6,2.7,5.1,1.6,versicolor 86 | 5.4,3,4.5,1.5,versicolor 87 | 6,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3,4.1,1.3,versicolor 91 | 5.5,2.5,4,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,3,4.6,1.4,versicolor 94 | 5.8,2.6,4,1.2,versicolor 95 | 5,2.3,3.3,1,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6,2.5,virginica 103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3,5.8,2.2,virginica 107 | 7.6,3,6.6,2.1,virginica 108 | 4.9,2.5,4.5,1.7,virginica 109 | 7.3,2.9,6.3,1.8,virginica 110 | 6.7,2.5,5.8,1.8,virginica 111 | 7.2,3.6,6.1,2.5,virginica 112 | 6.5,3.2,5.1,2,virginica 113 | 6.4,2.7,5.3,1.9,virginica 114 | 6.8,3,5.5,2.1,virginica 115 | 5.7,2.5,5,2,virginica 116 | 5.8,2.8,5.1,2.4,virginica 117 | 
6.4,3.2,5.3,2.3,virginica 118 | 6.5,3,5.5,1.8,virginica 119 | 7.7,3.8,6.7,2.2,virginica 120 | 7.7,2.6,6.9,2.3,virginica 121 | 6,2.2,5,1.5,virginica 122 | 6.9,3.2,5.7,2.3,virginica 123 | 5.6,2.8,4.9,2,virginica 124 | 7.7,2.8,6.7,2,virginica 125 | 6.3,2.7,4.9,1.8,virginica 126 | 6.7,3.3,5.7,2.1,virginica 127 | 7.2,3.2,6,1.8,virginica 128 | 6.2,2.8,4.8,1.8,virginica 129 | 6.1,3,4.9,1.8,virginica 130 | 6.4,2.8,5.6,2.1,virginica 131 | 7.2,3,5.8,1.6,virginica 132 | 7.4,2.8,6.1,1.9,virginica 133 | 7.9,3.8,6.4,2,virginica 134 | 6.4,2.8,5.6,2.2,virginica 135 | 6.3,2.8,5.1,1.5,virginica 136 | 6.1,2.6,5.6,1.4,virginica 137 | 7.7,3,6.1,2.3,virginica 138 | 6.3,3.4,5.6,2.4,virginica 139 | 6.4,3.1,5.5,1.8,virginica 140 | 6,3,4.8,1.8,virginica 141 | 6.9,3.1,5.4,2.1,virginica 142 | 6.7,3.1,5.6,2.4,virginica 143 | 6.9,3.1,5.1,2.3,virginica 144 | 5.8,2.7,5.1,1.9,virginica 145 | 6.8,3.2,5.9,2.3,virginica 146 | 6.7,3.3,5.7,2.5,virginica 147 | 6.7,3,5.2,2.3,virginica 148 | 6.3,2.5,5,1.9,virginica 149 | 6.5,3,5.2,2,virginica 150 | 6.2,3.4,5.4,2.3,virginica 151 | 5.9,3,5.1,1.8,virginica -------------------------------------------------------------------------------- /PySpark/data/movie_ratings_df.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/PySpark/data/movie_ratings_df.csv -------------------------------------------------------------------------------- /PySpark/data/sample_data.csv: -------------------------------------------------------------------------------- 1 | ratings,age,experience,family,mobile 2 | 3,32,9,3,Vivo 3 | 3,27,13,3,Apple 4 | 4,22,2.5,0,Samsung 5 | 4,37,16.5,4,Apple 6 | 5,27,9,1,MI 7 | 4,27,9,0,Oppo 8 | 5,37,23,5,Vivo 9 | 5,37,23,5,Samsung 10 | 3,22,2.5,0,Apple 11 | 3,27,6,0,MI 12 | 2,27,6,2,Oppo 13 | 5,27,6,2,Samsung 14 | 3,37,16.5,5,Apple 15 | 5,27,6,0,MI 16 | 4,22,6,1,Oppo 17 | 4,37,9,2,Samsung 18 | 4,27,6,1,Apple 19 | 1,37,23,5,MI 20 | 2,42,23,2,Oppo 21 | 4,37,6,0,Vivo 22 | 5,22,2.5,0,Samsung 23 | 3,37,16.5,5,Apple 24 | 3,42,23,5,MI 25 | 2,27,9,2,Samsung 26 | 4,27,6,1,Apple 27 | 5,27,2.5,0,MI 28 | 2,27,6,2,Oppo 29 | 5,37,13,1,Vivo 30 | 2,32,16.5,2,Oppo 31 | 3,27,6,0,MI 32 | 3,27,6,0,MI 33 | 4,22,6,1,Oppo 34 | 4,37,6,0,Vivo -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-Library-Learning 2 | Here we will sort out a variety of interesting Python library learning 3 | -------------------------------------------------------------------------------- /RSA实战/001-rsa生成公私钥并保存.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time    : 2020/7/15 12:00 3 | # @Author  : xiaolu 4 | # @FileName: 001-rsa生成公私钥并保存.py 5 | # @Software: PyCharm 6 | import rsa 7 | 8 | pubkey, privkey = rsa.newkeys(1024) # 生成公钥和私钥 9 | 10 | pub = pubkey.save_pkcs1() # 将生成的公钥和私钥进行转换, 以便存储 11 | pri = privkey.save_pkcs1() # save_pkcs1()是内置方法, 其默认参数就是"PEM" 12 | 13 | with open('pubkey.pem', mode='wb') as f, open('privkey.pem', mode='wb') as f1: 14 | f.write(pub) 15 | f1.write(pri) 16 | -------------------------------------------------------------------------------- /RSA实战/002-公钥加密私钥解密.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time    : 2020/7/15 12:05 3 | # @Author  : xiaolu 4 | # @FileName: 002-公钥加密私钥解密.py 
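# 编辑者补充(假设性示例, 非原文件内容): rsa库还支持私钥签名、公钥验签:
#   signature = rsa.sign(message.encode('utf8'), privkey, 'SHA-256')
#   rsa.verify(message.encode('utf8'), signature, pubkey)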
5 | # @Software: PyCharm 6 | import rsa 7 | 8 | if __name__ == '__main__': 9 | with open('pubkey.pem', mode='rb') as f, open('privkey.pem', 'rb') as f1: 10 | # 从文件读取公私钥 11 | pub = f.read() 12 | pri = f1.read() 13 | 14 | # 转为原始的状态 15 | pubkey = rsa.PublicKey.load_pkcs1(pub) 16 | privkey = rsa.PrivateKey.load_pkcs1(pri) 17 | 18 | message = '你是个傻逼吧' 19 | info = rsa.encrypt(message.encode('utf8'), pubkey) 20 | msg = rsa.decrypt(info, privkey) 21 | print(msg.decode('utf8')) 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /apscheduler实现定时任务/定时任务.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 定时任务.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-10-27 6 | """ 7 | import time 8 | from datetime import datetime 9 | from apscheduler.schedulers.blocking import BlockingScheduler 10 | 11 | 12 | def my_job(text): 13 | print('{}'.format(text), time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) 14 | 15 | 16 | if __name__ == '__main__': 17 | sched = BlockingScheduler() 18 | # sched.add_job(my_job, 'interval', days=0, hours=24, minutes=0, seconds=0) # 每隔24小时执行一次 19 | # sched.add_job(my_job, 'interval', seconds=5, args=['北京时间:']) # 每个5秒执行 用interval 20 | 21 | # 指定某个时间点执行一次 22 | sched.add_job(my_job, 'date', run_date=datetime(2021, 10, 27, 17, 8, 5), args=['北京时间:']) 23 | sched.start() -------------------------------------------------------------------------------- /chinesebert中的pinyin和glyph的处理/MSYH.TTC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/chinesebert中的pinyin和glyph的处理/MSYH.TTC -------------------------------------------------------------------------------- /chinesebert中的pinyin和glyph的处理/image_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : image_test.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-07-22 6 | """ 7 | import os 8 | import pygame 9 | 10 | chinese_dir = 'chinese' 11 | if not os.path.exists(chinese_dir): 12 | os.mkdir(chinese_dir) 13 | 14 | pygame.init() 15 | 16 | # 1. 写出所有的汉字 17 | # start, end = (0x4E00, 0x9FA5) # 汉字编码范围 18 | # 19 | # for codepoint in range(int(start), int(end)): 20 | # word = chr(codepoint) 21 | # font = pygame.font.Font("MSYH.TTC", 22) # 当前目录下要有微软雅黑的字体文件msyh.ttc,或者去c:\Windows\Fonts目录下找 22 | # rtext = font.render(word, True, (0, 0, 0), (255, 255, 255)) 23 | # pygame.image.save(rtext, os.path.join(chinese_dir, word + ".png")) 24 | 25 | # 2. 
指定汉字 对于不同的字体 可以切换MSYH.TTC文件就行
26 | word = '新'
27 | font = pygame.font.Font("MSYH.TTC", 22) # 当前目录下要有微软雅黑的字体文件msyh.ttc,或者去c:\Windows\Fonts目录下找
28 | rtext = font.render(word, True, (0, 0, 0), (255, 255, 255))
29 | pygame.image.save(rtext, os.path.join(chinese_dir, word + ".png"))
30 | 
31 | 
--------------------------------------------------------------------------------
/chinesebert中的pinyin和glyph的处理/pinyin_test.py:
--------------------------------------------------------------------------------
1 | """
2 | @file : pinyin_test.py
3 | @author : xiaolu
4 | @email : luxiaonlp@163.com
5 | @time : 2021-07-22
6 | """
7 | from pypinyin import pinyin, lazy_pinyin, Style
8 | 
9 | if __name__ == '__main__':
10 | print(pinyin('新浪微博')) # 输出: [['xīn'], ['làng'], ['wēi'], ['bó']]
11 | 
12 | print(lazy_pinyin('新浪微博')) # 输出: ['xin', 'lang', 'wei', 'bo']
13 | 
14 | # 将声调用数字表示 然后跟在拼音的后面
15 | style = Style.TONE3 # 1代表一声、2代表二声、3代表三声、4代表四声
16 | print(lazy_pinyin('新浪微博', style=style))
17 | 
18 | 
--------------------------------------------------------------------------------
/collections的用法/001-collections中的namedtuple用法.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time    : 2020/8/3 15:57
3 | # @Author  : xiaolu
4 | # @FileName: 001-collections中的namedtuple用法.py
5 | # @Software: PyCharm
6 | 
7 | # 我的认识: namedtuple相当于一种便捷的类定义方式
8 | from collections import namedtuple
9 | 
10 | Point = namedtuple("Point", ['x', 'y'])
11 | # 相当于定义了一个Point类,其中x, y为类的属性
12 | p = Point(1, 2)
13 | print(p.x)
14 | print(p.y)
15 | 
16 | 
17 | # 在深度学习中 我们可以定义参数文件
18 | from collections import namedtuple
19 | Config = namedtuple('Config', ['learning_rate',
20 | 'epoch',
21 | 'device',
22 | 'batch_size',
23 | 'vocab_size'])
24 | 
25 | 
26 | config = Config(
27 | learning_rate=1e-5,
28 | epoch=10,
29 | device=4,
30 | batch_size=32,
31 | vocab_size=12239
32 | )
33 | print(config.learning_rate)
34 | 
35 | 
--------------------------------------------------------------------------------
/elasticsearch/001-创建库并插入数据.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time    : 2020/7/30 15:04
3 | # @Author  : xiaolu
4 | # @FileName: 001-创建库并插入数据.py
5 | # @Software: PyCharm
6 | from elasticsearch import Elasticsearch
7 | 
8 | 
9 | es = Elasticsearch()
10 | 
11 | # result = es.indices.delete(index='point_type', ignore=[400, 404]) # 删除索引(库)
12 | # exit()
13 | 
14 | 
15 | mapping = {
16 | "settings": {
17 | "analysis": {
18 | "filter": {
19 | "jieba_stop": {
20 | "type": "stop",
21 | "stopwords_path": "stopwords/stopwords.txt"
22 | },
23 | "jieba_synonym": {
24 | "type": "synonym",
25 | "synonyms_path": "synonyms/synonyms.txt"
26 | },
27 | "my_shingle_filter": {
28 | "type": "shingle",
29 | "min_shingle_size": 2,
30 | "max_shingle_size": 2,
31 | "output_unigrams": False
32 | }
33 | },
34 | "analyzer": {
35 | "word_ana": { # 该名称需与下方mappings中引用的analyzer保持一致
36 | "tokenizer": "jieba_search", # 采用结巴分词
37 | "filter": "jieba_stop" # 采用结巴停用词过滤
38 | },
39 | "char_ana": {
40 | "tokenizer": "standard", # 对于字符 采用标准的分词方式 就是按字分割
41 | "filter": "jieba_stop" # 也采用jieba停用词过滤
42 | },
43 | "char_bigram_ana": {
44 | "type": "custom",
45 | "tokenizer": "standard",
46 | "filter": [
47 | "jieba_stop",
48 | "my_shingle_filter"
49 | ]
50 | },
51 | "word_bigram_ana": {
52 | "type": "custom",
53 | "tokenizer": "jieba_search",
54 | "filter": [
55 | "jieba_stop",
56 | "my_shingle_filter"
57 | ]
58 | }
59 | }
60 | }
61 | },
62 | "mappings": {
63 | "properties": {
64 | "title": {
65 | "type": 
"keyword" 66 | }, 67 | "author": { 68 | "type": "keyword" 69 | }, 70 | "dynasty": { 71 | "type": "keyword" 72 | }, 73 | "words": { 74 | "type": "integer" 75 | }, 76 | "content": { 77 | "analyzer": "word_ana", 78 | "search_analyzer": "word_ana", 79 | "type": "text" 80 | } 81 | } 82 | } 83 | } 84 | # 相当于将content入库时,会进行分词,然后采用jieba的停用词过滤方式。 当通过内容去查找时,也是先将问题分词,然后停用词过滤,在进行匹配。 85 | 86 | # es.indices.create(index='point_type', body=mapping) 87 | 88 | # 然后插入数据 89 | data = [ 90 | { 91 | "title": "静夜思", 92 | "author": "李白", 93 | "dynasty": "唐", 94 | "words": "20", 95 | "content": "床前明月光,疑是地上霜。举头望明月,低头思故乡。" 96 | }, 97 | 98 | { 99 | "title": "观沧海", 100 | "author": "曹操", 101 | "dynasty": "东汉末年", 102 | "words": "56", 103 | "content": "东临碣石,以观沧海。水何澹澹,山岛竦峙。树木丛生,百草丰茂。秋风萧瑟,洪波涌起。日月之行,若出其中。星汉灿烂,若出其里。幸甚至哉,歌以咏志。" 104 | }, 105 | 106 | { 107 | "title": "咏鹅", 108 | "author": "骆宾王", 109 | "dynasty": "唐", 110 | "words": "18", 111 | "content": "鹅鹅鹅,曲项向天歌。白毛浮绿水,红掌拨清波。" 112 | }, 113 | 114 | { 115 | "title": "将进酒", 116 | "author": "陈陶", 117 | "dynasty": "唐", 118 | "words": "14", 119 | "content": "银鸭金鹅言待谁,隋家岳渎皇家有" 120 | }, 121 | 122 | { 123 | "title": "春雪", 124 | "author": "白居易", 125 | "dynasty": "唐", 126 | "words": "10", 127 | "content": "大似落鹅毛,密如飘玉屑" 128 | } 129 | ] 130 | for d in data: 131 | es.index(index='point_type', body=d) 132 | -------------------------------------------------------------------------------- /elasticsearch/002-es中的搜索.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time    : 2020/7/30 15:52 3 | # @Author  : xiaolu 4 | # @FileName: 002-es中的搜索.py 5 | # @Software: PyCharm 6 | from elasticsearch import Elasticsearch 7 | 8 | 9 | if __name__ == '__main__': 10 | es = Elasticsearch() 11 | querys = '东临碣石' 12 | dsl = { 13 | 'query': { 14 | 'match': { 15 | 'title': '咏鹅' 16 | } 17 | } 18 | } 19 | results = es.search(index='point_type', body=dsl)['hits']['hits'] # 搜索多条结果的话 这里可能是一个列表 20 | 21 | res = [] 22 | for result in results: 23 | res.append(result['_source']) 24 | print(res) 25 | 26 | -------------------------------------------------------------------------------- /flask+echart+ajax/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/flask+echart+ajax/.DS_Store -------------------------------------------------------------------------------- /flask+echart+ajax/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : app.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2022-01-08 6 | """ 7 | import random 8 | from flask import Flask, render_template, jsonify 9 | 10 | app = Flask(__name__) 11 | 12 | 13 | @app.route('/left_data') 14 | def get_left_data(): 15 | day = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] 16 | # nums = [150, 230, 224, 218, 135, 147, 260] 17 | nums = [random.randint(0, 100) for _ in range(len(day))] 18 | random.shuffle(nums) 19 | data = {'day': day, 'nums': nums} 20 | return jsonify(data) 21 | 22 | 23 | @app.route('/') 24 | def index(): 25 | return render_template('index.html') 26 | 27 | 28 | if __name__ == '__main__': 29 | # app.run(port=6000) 30 | app.run(host='0.0.0.0') -------------------------------------------------------------------------------- /flask+echart+ajax/static/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/flask+echart+ajax/static/.DS_Store -------------------------------------------------------------------------------- /flask+echart+ajax/static/css/main.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | background: #333; 4 | } 5 | 6 | #left { 7 | position: absolute; 8 | width: 50%; 9 | height: 100%; 10 | top: 0%; 11 | left: 0%; 12 | background: #666666; 13 | /* color: white; */ 14 | } 15 | 16 | #right { 17 | position: absolute; 18 | width: 50%; 19 | height: 100%; 20 | top: 0%; 21 | right: 0%; 22 | color: #FFFFFF; 23 | /* font-size: 20px; */ 24 | background: green; 25 | } -------------------------------------------------------------------------------- /flask+echart+ajax/static/js/controller.js: -------------------------------------------------------------------------------- 1 | function get_left_data() { 2 | $.ajax({ 3 | url:"/left_data", 4 | success: function(data) { 5 | option_left.xAxis.data = data.day 6 | option_left.series[0].data = data.nums 7 | ec_left.setOption(option_left) 8 | }, 9 | error: function(xhr, type, errorThrown) { 10 | } 11 | }) 12 | } 13 | 14 | get_left_data() 15 | setInterval(get_left_data, 1000*5) -------------------------------------------------------------------------------- /flask+echart+ajax/static/js/left.js: -------------------------------------------------------------------------------- 1 | var ec_left = echarts.init(document.getElementById("left"), "dark"); 2 | 3 | option_left = { 4 | xAxis: { 5 | type: 'category', 6 | data: [] 7 | }, 8 | yAxis: { 9 | type: 'value' 10 | }, 11 | series: [ 12 | { 13 | data: [], 14 | type: 'line' 15 | } 16 | ] 17 | }; 18 | ec_left.setOption(option_left); -------------------------------------------------------------------------------- /flask+echart+ajax/static/js/right.js: -------------------------------------------------------------------------------- 1 | var ec_right = echarts.init(document.getElementById("right"), "dark"); 2 | 3 | option_right = { 4 | xAxis: { 5 | type: 'category', 6 | data: ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] 7 | }, 8 | yAxis: { 9 | type: 'value' 10 | }, 11 | series: [ 12 | { 13 | data: [150, 230, 224, 218, 135, 147, 260], 14 | type: 'line' 15 | } 16 | ] 17 | }; 18 | ec_right.setOption(option_right); -------------------------------------------------------------------------------- /flask+echart+ajax/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title 6 | 7 | 8 | 9 | 10 | 11 | 12 |
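<!-- 说明(推测):此页面内的标签在抓取时丢失。按 static 目录与 JS 代码推断,原 index.html 应引入 main.css、jquery.js、echarts.min.js,并提供 <div id="left"></div> 与 <div id="right"></div> 两个容器(left.js/right.js 中分别对其执行 echarts.init),最后引入 left.js、right.js 与 controller.js(后者每 5 秒请求一次 /left_data 刷新左图)。 -->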
13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /flask表单那些事/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/flask表单那些事/.DS_Store -------------------------------------------------------------------------------- /flask表单那些事/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : app.py 4 | # @Time : 2020/11/19 3:54 下午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | from flask import Flask 10 | from flask import render_template, request 11 | 12 | app = Flask(__name__) 13 | 14 | 15 | @app.route('/', methods=['POST', 'GET']) 16 | def my_index(): 17 | user_name = request.form.get('username') 18 | if user_name is not None: 19 | pass_word = request.form.get('pwd') 20 | sex = request.form.getlist('sex') 21 | property = request.form.getlist('property') 22 | content = request.form.get('content') 23 | print(content) 24 | print(user_name) 25 | print(pass_word) 26 | print(sex) 27 | print(property) 28 | return render_template('index.html') 29 | 30 | 31 | if __name__ == '__main__': 32 | app.run() 33 | 34 | -------------------------------------------------------------------------------- /flask表单那些事/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 表单那些事 6 | 7 | 8 |
9 | 用户名:<input type="text" name="username"><br>
10 | 密 码:<input type="password" name="pwd"><br>
11 | <p>性别:</p>
12 | 男神<input type="checkbox" name="sex" value="男神">
13 | 女神<input type="checkbox" name="sex" value="女神"><br>
14 | <p>你目前的家产:</p>
15 | 一辆自行车<input type="checkbox" name="property" value="一辆自行车">
16 | 一台电脑<input type="checkbox" name="property" value="一台电脑">
17 | 一个手机<input type="checkbox" name="property" value="一个手机"><br>
18 | <br>
19 | <textarea name="content" rows="4" cols="40"></textarea><br>
20 | <br>
23 | <input type="submit" value="提交">
24 | <!-- 注:原始表单标签在抓取时丢失,以上 input/textarea 标签按 app.py 中读取的字段名(username、pwd、sex、property、content)推测补全,具体属性仅供参考 -->
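<!-- 与 app.py 的对应关系:username、pwd、content 为单值字段,用 request.form.get() 读取;sex 与 property 为多选框,用 request.form.getlist() 读取值列表 -->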
25 | 26 | 27 | -------------------------------------------------------------------------------- /gensim/001-TF-IDF句子相似度计算.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time    : 2020/8/5 11:47 3 | # @Author  : xiaolu 4 | # @FileName: 001-TF-IDF句子相似度计算.py 5 | # @Software: PyCharm 6 | import jieba 7 | from gensim import corpora, models, similarities 8 | 9 | import numpy as np 10 | import linecache 11 | 12 | 13 | def similarity(query_path, query): 14 | ''' 15 | :param query_path: 问题库的路径 16 | :param query: 所提的问题 17 | :return: 问题库中与当前问题相似的问题索引 18 | ''' 19 | # 对问题库中的问题处理 20 | questions = [] 21 | with open(query_path, 'r', encoding='utf8') as f: 22 | for line in f.readlines(): 23 | line = line.strip() 24 | line = jieba.lcut(line) 25 | temp = [] 26 | for w in line: 27 | if w not in stopword: 28 | temp.append(w) 29 | questions.append(temp) 30 | 31 | # 创建词典 32 | dictionary = corpora.Dictionary(questions) 33 | # 基于词典,将分词列表集转换成稀疏向量集,即语料库 34 | questions = [dictionary.doc2bow(ques) for ques in questions] 35 | # 训练TF-IDF模型,传入语料库进行训练 36 | tfidf = models.TfidfModel(questions) # 传入的向量集 37 | # 用训练好的TF-IDF模型处理被检索文本,即语料库 38 | corpus_tfidf = tfidf[questions] 39 | # for temp in corpus_tfidf: # 每个问题中的每个词的tfidf值 40 | # print(temp) 41 | # 对当前所问问题进行处理 42 | 43 | new_vec = dictionary.doc2bow(query.split()) 44 | new_vec_tfidf = tfidf[new_vec] 45 | 46 | # 计算当前问题与问题库中所有问题的相似度 47 | index = similarities.MatrixSimilarity(corpus_tfidf) # 最相似问题 48 | sims = index[new_vec_tfidf] # 相似的列表吧 49 | # print(sims) 50 | 51 | max_loc = np.argmax(sims) # 最相似的问题(问题库)编号 52 | max_sim = sims[max_loc] 53 | # print(max_loc) # 5 相似问题的编号 54 | # print(max_sim) # 1.0 相似程度 55 | 56 | # 句子相似度阈值 57 | sup = 0.7 58 | # row_index默认为-1,即未匹配到满足相似度阈值的问题 59 | row_index = -1 60 | if max_sim > sup: 61 | # 相似度最大值对应文件中问题所在的行索引 62 | row_index = max_loc + 1 63 | return row_index 64 | 65 | 66 | def get_answer(answer_path, row_index): 67 | """ 68 | :func: 得到问题对应的答案 69 | :param answer_path: 答案存储所在文件路径 70 | :param row_index: 答案的行索引 71 | :return: 72 | """ 73 | answer = linecache.getline(answer_path, row_index) 74 | return answer 75 | 76 | 77 | if __name__ == '__main__': 78 | answer_path = './data/answer.txt' 79 | query_path = './data/question.txt' 80 | 81 | # 加载停用词 82 | stopword = [] 83 | with open('./data/stopwords.txt', 'r', encoding='utf8') as f: 84 | for line in f.readlines(): 85 | line = line.strip() 86 | stopword.append(line) 87 | print('退出请按q') 88 | while True: 89 | question = input('>:') 90 | if question == 'q': 91 | break 92 | 93 | # 首先分词然后去除停用词 94 | res = jieba.lcut(question) 95 | question_sep = [] 96 | for r in res: 97 | if r not in stopword: 98 | question_sep.append(r) 99 | # question_sep 是问题经过分词, 停用词处理后的词表 100 | query = ' '.join(line for line in question_sep) 101 | 102 | # 得到问题对应的行索引 也就是问题来了 我们先和问题库中的问题匹配 得到问题库中的相似问题 103 | row_index = similarity(query_path, query) # 找到相似问题的索引位置了 104 | 105 | answer = get_answer(answer_path, row_index) 106 | print('<:', answer) 107 | -------------------------------------------------------------------------------- /gensim/002-gensim文本摘要.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2020/11/3 14:43 4 | @Auth : xiaolu 5 | @File :002-gensim文本摘要.py 6 | @IDE :PyCharm 7 | @Email:luxiaonlp@163.com 8 | """ 9 | import re 10 | from LAC import LAC 11 | from gensim.summarization.summarizer import summarize 12 | 13 | 14 | def clean(content): 15 | content = content.replace('.', '') 
16 | content = content.replace(' ', '') 17 | content = content.replace('\n', '.') 18 | return content 19 | 20 | 21 | def process_data(text, lac): 22 | # 首先对text进行分句子 主要防止摘要为半句话 23 | text = re.split('[.。?!]', text) 24 | 25 | sentences = [] 26 | for t in text: 27 | if len(t) == 0: 28 | continue 29 | t = lac.run(t) 30 | sentences.append(' '.join(t)) 31 | 32 | # 最后用.将句子连起来 33 | return '. '.join(sentences) 34 | 35 | 36 | if __name__ == '__main__': 37 | lac = LAC(mode='seg') 38 | 39 | # 1. 加载文章 40 | data = [] 41 | with open('./data/text.txt', 'r', encoding='utf8') as f: 42 | lines = f.readlines() 43 | for i, line in enumerate(lines): 44 | line = line.strip() 45 | line = process_data(line, lac) 46 | line = summarize(line) 47 | line = clean(line) 48 | print('*' * 20 + '第{}篇文章的摘要'.format(i + 1) + '*' * 20) 49 | print(line) 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /gensim/data/answer.txt: -------------------------------------------------------------------------------- 1 | 中国的首都是北京 2 | 美国的首都在华盛顿 3 | 陕西的省会城市是西安 4 | 山西的省会城市是太原 5 | 姚明的老婆是叶莉 6 | 姚明的女儿是姚沁蕾 7 | 国家主席是习近平 -------------------------------------------------------------------------------- /gensim/data/question.txt: -------------------------------------------------------------------------------- 1 | 中国的首都在哪儿 2 | 美国的首都在哪儿 3 | 陕西的省会城市在哪 4 | 山西的省会城市在哪儿 5 | 姚明的老婆是谁 6 | 姚明的女儿是谁 7 | 国家主席是谁呀 -------------------------------------------------------------------------------- /gensim/data/stopwords.txt: -------------------------------------------------------------------------------- 1 | ——— 2 | 》), 3 | )÷(1- 4 | ”, 5 | )、 6 | =( 7 | : 8 | → 9 | ℃ 10 | & 11 | * 12 | 一一 13 | ~~~~ 14 | ’ 15 | . 16 | 『 17 | .一 18 | ./ 19 | -- 20 | 』 21 | =″ 22 | 【 23 | [*] 24 | }> 25 | [⑤]] 26 | [①D] 27 | c] 28 | ng昉 29 | * 30 | // 31 | [ 32 | ] 33 | [②e] 34 | [②g] 35 | ={ 36 | } 37 | ,也 38 | ‘ 39 | A 40 | [①⑥] 41 | [②B] 42 | [①a] 43 | [④a] 44 | [①③] 45 | [③h] 46 | ③] 47 | 1. 48 | -- 49 | [②b] 50 | ’‘ 51 | ××× 52 | [①⑧] 53 | 0:2 54 | =[ 55 | [⑤b] 56 | [②c] 57 | [④b] 58 | [②③] 59 | [③a] 60 | [④c] 61 | [①⑤] 62 | [①⑦] 63 | [①g] 64 | ∈[ 65 | [①⑨] 66 | [①④] 67 | [①c] 68 | [②f] 69 | [②⑧] 70 | [②①] 71 | [①C] 72 | [③c] 73 | [③g] 74 | [②⑤] 75 | [②②] 76 | 一. 77 | [①h] 78 | .数 79 | [] 80 | [①B] 81 | 数/ 82 | [①i] 83 | [③e] 84 | [①①] 85 | [④d] 86 | [④e] 87 | [③b] 88 | [⑤a] 89 | [①A] 90 | [②⑧] 91 | [②⑦] 92 | [①d] 93 | [②j] 94 | 〕〔 95 | ][ 96 | :// 97 | ′∈ 98 | [②④ 99 | [⑤e] 100 | 12% 101 | b] 102 | ... 103 | ................... 104 | …………………………………………………③ 105 | ZXFITL 106 | [③F] 107 | 」 108 | [①o] 109 | ]∧′=[ 110 | ∪φ∈ 111 | ′| 112 | {- 113 | ②c 114 | } 115 | [③①] 116 | R.L. 117 | [①E] 118 | Ψ 119 | -[*]- 120 | ↑ 121 | .日 122 | [②d] 123 | [② 124 | [②⑦] 125 | [②②] 126 | [③e] 127 | [①i] 128 | [①B] 129 | [①h] 130 | [①d] 131 | [①g] 132 | [①②] 133 | [②a] 134 | f] 135 | [⑩] 136 | a] 137 | [①e] 138 | [②h] 139 | [②⑥] 140 | [③d] 141 | [②⑩] 142 | e] 143 | 〉 144 | 】 145 | 元/吨 146 | [②⑩] 147 | 2.3% 148 | 5:0 149 | [①] 150 | :: 151 | [②] 152 | [③] 153 | [④] 154 | [⑤] 155 | [⑥] 156 | [⑦] 157 | [⑧] 158 | [⑨] 159 | …… 160 | —— 161 | ? 162 | 、 163 | 。 164 | “ 165 | ” 166 | 《 167 | 》 168 | ! 169 | , 170 | : 171 | ; 172 | ? 173 | . 174 | , 175 | . 176 | ' 177 | ? 178 | · 179 | ——— 180 | ── 181 | ? 
182 | — 183 | < 184 | > 185 | ( 186 | ) 187 | 〔 188 | 〕 189 | [ 190 | ] 191 | ( 192 | ) 193 | - 194 | + 195 | ~ 196 | × 197 | / 198 | / 199 | ① 200 | ② 201 | ③ 202 | ④ 203 | ⑤ 204 | ⑥ 205 | ⑦ 206 | ⑧ 207 | ⑨ 208 | ⑩ 209 | Ⅲ 210 | В 211 | " 212 | ; 213 | # 214 | @ 215 | γ 216 | μ 217 | φ 218 | φ. 219 | × 220 | Δ 221 | ■ 222 | ▲ 223 | sub 224 | exp 225 | sup 226 | sub 227 | Lex 228 | # 229 | % 230 | & 231 | ' 232 | + 233 | +ξ 234 | ++ 235 | - 236 | -β 237 | < 238 | <± 239 | <Δ 240 | <λ 241 | <φ 242 | << 243 | = 244 | = 245 | =☆ 246 | =- 247 | > 248 | >λ 249 | _ 250 | ~± 251 | ~+ 252 | [⑤f] 253 | [⑤d] 254 | [②i] 255 | ≈ 256 | [②G] 257 | [①f] 258 | LI 259 | ㈧ 260 | [- 261 | ...... 262 | 〉 263 | [③⑩] 264 | 第二 265 | 一番 266 | 一直 267 | 一个 268 | 一些 269 | 许多 270 | 种 271 | 有的是 272 | 也就是说 273 | 末##末 274 | 啊 275 | 阿 276 | 哎 277 | 哎呀 278 | 哎哟 279 | 唉 280 | 俺 281 | 俺们 282 | 按 283 | 按照 284 | 吧 285 | 吧哒 286 | 把 287 | 罢了 288 | 被 289 | 本 290 | 本着 291 | 比 292 | 比方 293 | 比如 294 | 鄙人 295 | 彼 296 | 彼此 297 | 边 298 | 别 299 | 别的 300 | 别说 301 | 并 302 | 并且 303 | 不比 304 | 不成 305 | 不单 306 | 不但 307 | 不独 308 | 不管 309 | 不光 310 | 不过 311 | 不仅 312 | 不拘 313 | 不论 314 | 不怕 315 | 不然 316 | 不如 317 | 不特 318 | 不惟 319 | 不问 320 | 不只 321 | 朝 322 | 朝着 323 | 趁 324 | 趁着 325 | 乘 326 | 冲 327 | 除 328 | 除此之外 329 | 除非 330 | 除了 331 | 此 332 | 此间 333 | 此外 334 | 从 335 | 从而 336 | 打 337 | 待 338 | 但 339 | 但是 340 | 当 341 | 当着 342 | 到 343 | 得 344 | 的 345 | 的话 346 | 等 347 | 等等 348 | 地 349 | 第 350 | 叮咚 351 | 对 352 | 对于 353 | 多 354 | 多少 355 | 而 356 | 而况 357 | 而且 358 | 而是 359 | 而外 360 | 而言 361 | 而已 362 | 尔后 363 | 反过来 364 | 反过来说 365 | 反之 366 | 非但 367 | 非徒 368 | 否则 369 | 嘎 370 | 嘎登 371 | 该 372 | 赶 373 | 个 374 | 各 375 | 各个 376 | 各位 377 | 各种 378 | 各自 379 | 给 380 | 根据 381 | 跟 382 | 故 383 | 故此 384 | 固然 385 | 关于 386 | 管 387 | 归 388 | 果然 389 | 果真 390 | 过 391 | 哈 392 | 哈哈 393 | 呵 394 | 和 395 | 何 396 | 何处 397 | 何况 398 | 何时 399 | 嘿 400 | 哼 401 | 哼唷 402 | 呼哧 403 | 乎 404 | 哗 405 | 还是 406 | 还有 407 | 换句话说 408 | 换言之 409 | 或 410 | 或是 411 | 或者 412 | 极了 413 | 及 414 | 及其 415 | 及至 416 | 即 417 | 即便 418 | 即或 419 | 即令 420 | 即若 421 | 即使 422 | 几 423 | 几时 424 | 己 425 | 既 426 | 既然 427 | 既是 428 | 继而 429 | 加之 430 | 假如 431 | 假若 432 | 假使 433 | 鉴于 434 | 将 435 | 较 436 | 较之 437 | 叫 438 | 接着 439 | 结果 440 | 借 441 | 紧接着 442 | 进而 443 | 尽 444 | 尽管 445 | 经 446 | 经过 447 | 就 448 | 就是 449 | 就是说 450 | 据 451 | 具体地说 452 | 具体说来 453 | 开始 454 | 开外 455 | 靠 456 | 咳 457 | 可 458 | 可见 459 | 可是 460 | 可以 461 | 况且 462 | 啦 463 | 来 464 | 来着 465 | 离 466 | 例如 467 | 哩 468 | 连 469 | 连同 470 | 两者 471 | 了 472 | 临 473 | 另 474 | 另外 475 | 另一方面 476 | 论 477 | 嘛 478 | 吗 479 | 慢说 480 | 漫说 481 | 冒 482 | 么 483 | 每 484 | 每当 485 | 们 486 | 莫若 487 | 某 488 | 某个 489 | 某些 490 | 拿 491 | 哪 492 | 哪边 493 | 哪儿 494 | 哪个 495 | 哪里 496 | 哪年 497 | 哪怕 498 | 哪天 499 | 哪些 500 | 哪样 501 | 那 502 | 那边 503 | 那儿 504 | 那个 505 | 那会儿 506 | 那里 507 | 那么 508 | 那么些 509 | 那么样 510 | 那时 511 | 那些 512 | 那样 513 | 乃 514 | 乃至 515 | 呢 516 | 能 517 | 你 518 | 你们 519 | 您 520 | 宁 521 | 宁可 522 | 宁肯 523 | 宁愿 524 | 哦 525 | 呕 526 | 啪达 527 | 旁人 528 | 呸 529 | 凭 530 | 凭借 531 | 其 532 | 其次 533 | 其二 534 | 其他 535 | 其它 536 | 其一 537 | 其余 538 | 其中 539 | 起 540 | 起见 541 | 起见 542 | 岂但 543 | 恰恰相反 544 | 前后 545 | 前者 546 | 且 547 | 然而 548 | 然后 549 | 然则 550 | 让 551 | 人家 552 | 任 553 | 任何 554 | 任凭 555 | 如 556 | 如此 557 | 如果 558 | 如何 559 | 如其 560 | 如若 561 | 如上所述 562 | 若 563 | 若非 564 | 若是 565 | 啥 566 | 上下 567 | 尚且 568 | 设若 569 | 设使 570 | 甚而 571 | 甚么 572 | 甚至 573 | 省得 574 | 时候 575 | 什么 576 | 什么样 577 | 使得 578 | 是 579 | 是的 580 | 首先 581 | 谁 582 | 谁知 583 | 顺 584 | 顺着 585 | 似的 586 | 虽 587 | 虽然 588 | 虽说 589 | 虽则 
590 | 随 591 | 随着 592 | 所 593 | 所以 594 | 他 595 | 他们 596 | 他人 597 | 它 598 | 它们 599 | 她 600 | 她们 601 | 倘 602 | 倘或 603 | 倘然 604 | 倘若 605 | 倘使 606 | 腾 607 | 替 608 | 通过 609 | 同 610 | 同时 611 | 哇 612 | 万一 613 | 往 614 | 望 615 | 为 616 | 为何 617 | 为了 618 | 为什么 619 | 为着 620 | 喂 621 | 嗡嗡 622 | 我 623 | 我们 624 | 呜 625 | 呜呼 626 | 乌乎 627 | 无论 628 | 无宁 629 | 毋宁 630 | 嘻 631 | 吓 632 | 相对而言 633 | 像 634 | 向 635 | 向着 636 | 嘘 637 | 呀 638 | 焉 639 | 沿 640 | 沿着 641 | 要 642 | 要不 643 | 要不然 644 | 要不是 645 | 要么 646 | 要是 647 | 也 648 | 也罢 649 | 也好 650 | 一 651 | 一般 652 | 一旦 653 | 一方面 654 | 一来 655 | 一切 656 | 一样 657 | 一则 658 | 依 659 | 依照 660 | 矣 661 | 以 662 | 以便 663 | 以及 664 | 以免 665 | 以至 666 | 以至于 667 | 以致 668 | 抑或 669 | 因 670 | 因此 671 | 因而 672 | 因为 673 | 哟 674 | 用 675 | 由 676 | 由此可见 677 | 由于 678 | 有 679 | 有的 680 | 有关 681 | 有些 682 | 又 683 | 于 684 | 于是 685 | 于是乎 686 | 与 687 | 与此同时 688 | 与否 689 | 与其 690 | 越是 691 | 云云 692 | 哉 693 | 再说 694 | 再者 695 | 在 696 | 在下 697 | 咱 698 | 咱们 699 | 则 700 | 怎 701 | 怎么 702 | 怎么办 703 | 怎么样 704 | 怎样 705 | 咋 706 | 照 707 | 照着 708 | 者 709 | 这 710 | 这边 711 | 这儿 712 | 这个 713 | 这会儿 714 | 这就是说 715 | 这里 716 | 这么 717 | 这么点儿 718 | 这么些 719 | 这么样 720 | 这时 721 | 这些 722 | 这样 723 | 正如 724 | 吱 725 | 之 726 | 之类 727 | 之所以 728 | 之一 729 | 只是 730 | 只限 731 | 只要 732 | 只有 733 | 至 734 | 至于 735 | 诸位 736 | 着 737 | 着呢 738 | 自 739 | 自从 740 | 自个儿 741 | 自各儿 742 | 自己 743 | 自家 744 | 自身 745 | 综上所述 746 | 总的来看 747 | 总的来说 748 | 总的说来 749 | 总而言之 750 | 总之 751 | 纵 752 | 纵令 753 | 纵然 754 | 纵使 755 | 遵照 756 | 作为 757 | 兮 758 | 呃 759 | 呗 760 | 咚 761 | 咦 762 | 喏 763 | 啐 764 | 喔唷 765 | 嗬 766 | 嗯 767 | 嗳 768 | -------------------------------------------------------------------------------- /gensim/data/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time    : 2020/8/6 10:18 3 | # @Author  : xiaolu 4 | # @FileName: test.py 5 | # @Software: PyCharm 6 | import linecache 7 | 8 | 9 | path = 'answer.txt' 10 | for i in range(5): 11 | answer = linecache.getline(path, i) 12 | answer = answer.strip() 13 | print(answer) -------------------------------------------------------------------------------- /gradio学习/01-row_column_layout.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 01-row_column_layout.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2023-05-12 6 | """ 7 | import gradio as gr 8 | 9 | title = "抽取式问答" 10 | 11 | description = "输入上下文与问题后,点击submit按钮,可从上下文中抽取出答案,赶快试试吧!" 
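# 提示:上面的 title、description、article 文案与下方 Blocks 中 gr.Markdown 里硬编码的内容相同,
# 也可以直接复用这些变量(例如 gr.Markdown("# " + title));examples 列表会在文件末尾通过
# gr.Examples(examples, inputs=[context, question]) 挂载,点击示例即可自动填充两个输入框。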
12 | 13 | examples = [ 14 | ["普希金从那里学习人民的语言,吸取了许多有益的养料,这一切对普希金后来的创作产生了很大的影响。这两年里,普希金创作了不少优秀的作品,如《囚徒》、《致大海》、《致凯恩》和《假如生活欺骗了你》等几十首抒情诗,叙事诗《努林伯爵》,历史剧《鲍里斯·戈都诺夫》,以及《叶甫盖尼·奥涅金》前六章。", "著名诗歌《假如生活欺骗了你》的作者是"], 15 | ["普希金从那里学习人民的语言,吸取了许多有益的养料,这一切对普希金后来的创作产生了很大的影响。这两年里,普希金创作了不少优秀的作品,如《囚徒》、《致大海》、《致凯恩》和《假如生活欺骗了你》等几十首抒情诗,叙事诗《努林伯爵》,历史剧《鲍里斯·戈都诺夫》,以及《叶甫盖尼·奥涅金》前六章。", "普希金创作的叙事诗叫什么"] 16 | ] 17 | 18 | article = "感兴趣的小伙伴可以阅读[Transformers实用指南](https://zhuanlan.zhihu.com/p/548336726)" 19 | 20 | 21 | # 预测函数 22 | def custom_predict(context, question): 23 | answer = '对不起 我就是不给你回答' 24 | answer = question + ": " + answer 25 | score = 0.01 26 | return answer, score 27 | 28 | 29 | # 清除输入输出 30 | def clear_input(): 31 | return "", "", "", "" 32 | 33 | 34 | # 构建Blocks上下文 35 | with gr.Blocks() as demo: 36 | gr.Markdown("# 抽取式问答") 37 | gr.Markdown("输入上下文与问题后,点击submit按钮,可从上下文中抽取出答案,赶快试试吧!") 38 | with gr.Column(): # 列排列 39 | context = gr.Textbox(label="context") 40 | question = gr.Textbox(label="question") 41 | with gr.Row(): # 行排列 42 | clear = gr.Button("clear") # 清除按钮 43 | submit = gr.Button("submit") # submit提交按钮 44 | with gr.Column(): # 列排列 45 | answer = gr.Textbox(label="answer") 46 | score = gr.Label(label="score") 47 | 48 | # 绑定submit点击函数 49 | submit.click(fn=custom_predict, inputs=[context, question], outputs=[answer, score]) 50 | 51 | # 绑定clear点击函数 52 | clear.click(fn=clear_input, inputs=[], outputs=[context, question, answer, score]) 53 | gr.Examples(examples, inputs=[context, question]) 54 | gr.Markdown("感兴趣的小伙伴可以阅读[Transformers实用指南](https://zhuanlan.zhihu.com/p/548336726)") 55 | 56 | demo.launch() 57 | -------------------------------------------------------------------------------- /gradio学习/02-chatglm_web.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 02-chatglm_web.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2023-05-12 6 | """ 7 | from transformers import AutoModel, AutoTokenizer 8 | import gradio as gr 9 | 10 | # 加载模型 11 | tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True) 12 | model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda() 13 | model = model.eval() 14 | 15 | 16 | """Override Chatbot.postprocess""" 17 | def postprocess(self, y): 18 | if y is None: 19 | return [] 20 | for i, (message, response) in enumerate(y): 21 | y[i] = ( 22 | None if message is None else mdtex2html.convert((message)), 23 | None if response is None else mdtex2html.convert(response), 24 | ) 25 | return y 26 | 27 | 28 | gr.Chatbot.postprocess = postprocess 29 | 30 | 31 | def parse_text(text): 32 | """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/""" 33 | lines = text.split("\n") 34 | lines = [line for line in lines if line != ""] 35 | count = 0 36 | for i, line in enumerate(lines): 37 | if "```" in line: 38 | count += 1 39 | items = line.split('`') 40 | if count % 2 == 1: 41 | lines[i] = f'
<pre><code class="language-{items[-1]}">'
 42 |             else:
43 |                 lines[i] = f'<br></code></pre>' 44 |         else: 45 |             if i > 0: 46 |                 if count % 2 == 1: 47 |                     line = line.replace("`", "\`") 48 |                     line = line.replace("<", "&lt;") 49 |                     line = line.replace(">", "&gt;") 50 |                     line = line.replace(" ", "&nbsp;") 51 |                     line = line.replace("*", "&ast;") 52 |                     line = line.replace("_", "&lowbar;") 53 |                     line = line.replace("-", "&#45;") 54 |                     line = line.replace(".", "&#46;") 55 |                     line = line.replace("!", "&#33;") 56 |                     line = line.replace("(", "&#40;") 57 |                     line = line.replace(")", "&#41;") 58 |                     line = line.replace("$", "&#36;") 59 |                 lines[i] = "<br>
"+line 60 | text = "".join(lines) 61 | return text 62 | 63 | 64 | def predict(input, chatbot, max_length, top_p, temperature, history): 65 | chatbot.append((parse_text(input), "")) 66 | for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p, 67 | temperature=temperature): 68 | chatbot[-1] = (parse_text(input), parse_text(response)) 69 | yield chatbot, history 70 | 71 | 72 | def reset_user_input(): 73 | return gr.update(value='') 74 | 75 | 76 | def reset_state(): 77 | return [], [] 78 | 79 | 80 | with gr.Blocks() as demo: 81 | gr.HTML("""

<h1 align="center">ChatGLM</h1>
""") # 可以加入前端代码显示 82 | 83 | chatbot = gr.Chatbot() # 占一行 chatbot 84 | with gr.Row(): # 下面的每个元素行行排列 85 | with gr.Column(scale=4): # 行 左 占总行空间的4/5 86 | with gr.Column(scale=12): 87 | user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10).style( 88 | container=False) 89 | with gr.Column(min_width=32, scale=1): 90 | submitBtn = gr.Button("Submit", variant="primary") 91 | 92 | with gr.Column(scale=1): # 行 右 占总行空间的1/5 93 | emptyBtn = gr.Button("Clear History") 94 | max_length = gr.Slider(0, 4096, value=2048, step=1.0, label="Maximum length", interactive=True) 95 | top_p = gr.Slider(0, 1, value=0.7, step=0.01, label="Top P", interactive=True) 96 | temperature = gr.Slider(0, 1, value=0.95, step=0.01, label="Temperature", interactive=True) 97 | 98 | history = gr.State([]) 99 | submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, history], [chatbot, history], 100 | show_progress=True) 101 | submitBtn.click(reset_user_input, [], [user_input]) # 点了提交按钮后 用户输入框也得改下 102 | emptyBtn.click(reset_state, outputs=[chatbot, history], show_progress=True) 103 | 104 | demo.queue().launch(share=False, inbrowser=True) 105 | # 如果想加登录 106 | # zhanghu = [["xiaolu", "1234"]] 107 | # demo.queue().launch(share=True, server_name='0.0.0.0', server_port=6006, auth=zhanghu, auth_message='请联系xiaolu认证进行访问') 108 | -------------------------------------------------------------------------------- /ipdb调试python程序/001-简单调试.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time    : 2020/8/6 10:04 3 | # @Author  : xiaolu 4 | # @FileName: 001-简单调试.py 5 | # @Software: PyCharm 6 | from ipdb import set_trace 7 | 8 | 9 | if __name__ == "__main__": 10 | a = 0 11 | b = 1 12 | for i in range(1, 100, 2): 13 | a += i 14 | b *= i 15 | set_trace() 16 | 17 | 18 | # ipdb> print(a) 19 | # 1 20 | # ipdb> print(b) 21 | # 1 22 | # 接下来输入n 每输入一次 往后执行一行 23 | 24 | # 假设输入两次n 此时的a=4 b=1 输三次n 此时的a=4 b=3 25 | -------------------------------------------------------------------------------- /ipdb调试python程序/readme.txt: -------------------------------------------------------------------------------- 1 | ipdb命令大全: 2 | 3 | ENTER(重复上次命令) 4 | c(继续) 5 | l(查找当前位于哪里) 6 | s(进入子程序) 7 | r(运行直到子程序结束) 8 | ! 
9 | h(帮助) 10 | a(rgs) 打印当前函数的参数 11 | j(ump) 让程序跳转到指定的行数 12 | l(ist) 可以列出当前将要运行的代码块 13 | n(ext) 让程序运行下一行,如果当前语句有一个函数调用,用 n 是不会进入被调用的函数体中的 14 | p(rint) 最有用的命令之一,打印某个变量 15 | q(uit) 退出调试 16 | r(eturn) 继续执行,直到函数体返回 17 | s(tep) 跟 n 相似,但是如果当前有一个函数调用,那么 s 会进入被调用的函数体中 -------------------------------------------------------------------------------- /logging模块的使用/001-日志级别的使用.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 001-日志级别的使用.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-21 6 | """ 7 | import logging 8 | 9 | if __name__ == '__main__': 10 | logging.basicConfig(level=logging.NOTSET) # 这是级别 输出小于warning级别的信息 11 | logging.debug('数学') 12 | logging.info('英语') 13 | logging.warning('物理') 14 | logging.error('体育') 15 | logging.critical('政治') -------------------------------------------------------------------------------- /logging模块的使用/002-日志控制台输出.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 002-日志控制台输出.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-21 6 | """ 7 | import logging # 引入logging模块 8 | if __name__ == '__main__': 9 | logging.basicConfig(level=logging.DEBUG, # 输出的最低级别 10 | format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s') # logging.basicConfig函数对日志的输出格式及方式做相关配置 11 | # asctime是时间 filename是当前文件夹 lineno 行号 levelname 什么级别的错误 massage输出的信息 12 | # 由于日志基本配置中级别设置为DEBUG,所以一下打印信息将会全部显示在控制台上 13 | logging.info('this is a loggging info message') 14 | logging.debug('this is a loggging debug message') 15 | logging.warning('this is loggging a warning message') 16 | logging.error('this is an loggging error message') 17 | logging.critical('this is a loggging critical message') 18 | -------------------------------------------------------------------------------- /logging模块的使用/003-日志文件输出.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 003-日志文件输出.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-21 6 | """ 7 | import logging # 引入logging模块 8 | import os.path 9 | import time 10 | 11 | if __name__ == '__main__': 12 | # 第一步,创建一个logger 13 | logger = logging.getLogger() 14 | logger.setLevel(logging.INFO) # Log等级总开关 15 | 16 | # 第二步,创建一个handler,用于写入日志文件 17 | rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) 18 | log_path = os.path.dirname(os.getcwd()) + '/Logs/' 19 | os.makedirs(log_path, exist_ok=True) # 创建文件夹 20 | log_name = log_path + rq + '.log' # 日志名 21 | logfile = log_name 22 | fh = logging.FileHandler(logfile, mode='w') 23 | fh.setLevel(logging.DEBUG) # 输出到file的log等级的开关 24 | 25 | # 第三步,定义handler的输出格式 26 | formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s") 27 | fh.setFormatter(formatter) 28 | 29 | # 第四步,将logger添加到handler里面 30 | logger.addHandler(fh) 31 | 32 | # 日志 33 | logger.debug('this is a logger debug message') 34 | logger.info('this is a logger info message') 35 | logger.warning('this is a logger warning message') 36 | logger.error('this is a logger error message') 37 | logger.critical('this is a logger critical message') -------------------------------------------------------------------------------- /logging模块的使用/004-捕捉异常.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 004-捕捉异常.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-21 6 | """ 7 | import os.path 8 | import time 9 | import 
logging 10 | 11 | if __name__ == '__main__': 12 | # 创建一个logger 13 | logger = logging.getLogger() 14 | logger.setLevel(logging.INFO) # Log等级总开关 15 | 16 | # 创建一个handler,用于写入日志文件 17 | rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) 18 | log_path = os.path.dirname(os.getcwd()) + '/Logs/' 19 | os.makedirs(log_path, exist_ok=True) 20 | log_name = log_path + rq + '.log' 21 | logfile = log_name 22 | fh = logging.FileHandler(logfile, mode='w') 23 | fh.setLevel(logging.DEBUG) # 输出到file的log等级的开关 24 | 25 | # 定义handler的输出格式 26 | formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s") 27 | fh.setFormatter(formatter) 28 | logger.addHandler(fh) 29 | 30 | # 使用logger.XX来记录错误,这里的"error"可以根据所需要的级别进行修改 31 | try: 32 | open('/path/to/does/not/exist', 'rb') 33 | except (SystemExit, KeyboardInterrupt): 34 | raise 35 | except Exception: 36 | logger.error('Failed to open file', exc_info=True) 37 | -------------------------------------------------------------------------------- /pandas一键画图/001-plot_zhexiantu.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 001-plot_zhexiantu.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-12-29 6 | """ 7 | # 安装pandas以及pandas_bokeh # pip install pandas_bokeh pandas 8 | import numpy as np 9 | import pandas as pd 10 | import pandas_bokeh 11 | 12 | # 注意 文件名字不要夹带中文 13 | 14 | if __name__ == '__main__': 15 | np.random.seed(55) 16 | df = pd.DataFrame({"宁德时代": np.random.randn(100)+0.2, 17 | "贵州茅台": np.random.randn(100)+0.17}, 18 | index=pd.date_range('1/1/2021', periods=100)) 19 | df = df.cumsum() # 累加 20 | df = df + 50 21 | df.plot_bokeh.line( 22 | figsize=(800, 450), # 图片的大小 23 | title="宁德时代 vs 贵州茅台", # 表名 24 | xlabel="日期", # 横坐标的名字 25 | ylabel="股票价格 [$]", # 纵坐标的名字 26 | # yticks=[0, 100, 200, 300, 400], # y轴的虚线 可以不带 27 | ylim=(45, 80), # y轴范围 28 | xlim=("2021-01-01", "2021-04-01"), # x轴的范围 29 | colormap=["red", "blue"], 30 | plot_data_points=True, # 标记每个值 31 | plot_data_points_size=5, 32 | marker="asterisk") 33 | -------------------------------------------------------------------------------- /pandas一键画图/002-plot_sandiantu.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Bokeh Plot 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 |
39 | 40 | 41 | 42 | 43 | 44 | 47 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /pandas一键画图/002-plot_sandiantu.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 002-plot_sandiantu.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-12-29 6 | """ 7 | import pandas as pd 8 | import pandas_bokeh 9 | 10 | if __name__ == '__main__': 11 | # 随便造一些数据 12 | df = pd.DataFrame({ 13 | 'length': [5.1, 4.9, 4.7, 4.6, 5., 5.4, 4.6, 5., 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7], 14 | 'width': [3.5, 3., 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3., 3., 4., 4.4], 15 | 'label': [0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1] 16 | }) 17 | 18 | p_scatter = df.plot_bokeh.scatter( 19 | x="length", 20 | y="width", 21 | category="label", # 如果有类别,还可以加上 category 参数按类别着色 22 | title="随便一画", 23 | show_figure=True, 24 | ) -------------------------------------------------------------------------------- /pandas一键画图/003-plot_zhuzhuangtu.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Bokeh Plot 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 |
39 | 40 | 41 | 42 | 43 | 44 | 47 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /pandas一键画图/003-plot_zhuzhuangtu.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 003-plot_zhuzhuangtu.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-12-29 6 | """ 7 | import pandas as pd 8 | import pandas_bokeh 9 | 10 | 11 | if __name__ == '__main__': 12 | data = { 13 | 'fruits': 14 | ['苹果', '梨', '草莓', '西瓜', '葡萄', '香蕉'], 15 | '2015': [2, 1, 4, 3, 2, 4], 16 | '2016': [5, 3, 3, 2, 4, 6], 17 | '2017': [3, 2, 4, 4, 5, 3] 18 | } 19 | df = pd.DataFrame(data).set_index("fruits") # 设置水果为索引 20 | 21 | p_bar = df.plot_bokeh.bar( 22 | ylabel="每斤的的价格 [¥]", 23 | title="水果每年的价格", 24 | alpha=0.6) -------------------------------------------------------------------------------- /py2neo操作neo4j/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/.DS_Store -------------------------------------------------------------------------------- /py2neo操作neo4j/py2neo简单练习/create_graph_v1.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : create_graph_v1.py 4 | # @Time : 2020/11/23 6:52 下午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | from py2neo import Graph, Node, Relationship, NodeMatcher 10 | import pandas as pd 11 | from pdb import set_trace 12 | 13 | 14 | def load_data(): 15 | # 加载数据 16 | # data = pd.read_excel('./santi.xlsx') 17 | # data = pd.read_excel('./mingchaonaxieshier.xlsx') 18 | data = pd.read_excel('./test.xlsx') 19 | start = data['S'].tolist() 20 | relation = data['P'].tolist() 21 | end = data['O'].tolist() 22 | start_list = [str(i) for i in start] 23 | relation_list = [str(i) for i in relation] 24 | end_list = [str(i) for i in end] 25 | link_dict = dict() 26 | link_dict['start'] = start_list 27 | link_dict['relation'] = relation_list 28 | link_dict['end'] = end_list 29 | df_data = pd.DataFrame(link_dict) 30 | return df_data 31 | 32 | 33 | class DataToNeo4j: 34 | def __init__(self): 35 | link = Graph() 36 | self.graph = link 37 | 38 | self.start = 'start' 39 | self.end = 'end' 40 | 41 | self.graph.delete_all() # 将之前的图 全部删除 42 | self.matcher = NodeMatcher(link) # 为了查找 43 | 44 | def create_node(self, start, end): 45 | # 创建节点 46 | for name in start: 47 | node = Node(self.start, name=name) 48 | self.graph.create(node) 49 | 50 | for name in end: 51 | node = Node(self.end, name=name) 52 | self.graph.create(node) 53 | 54 | def create_relation(self, df_data): 55 | m = 0 56 | for m in range(0, len(df_data)): 57 | # print(list(self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'"))) 58 | # 相当于在'start'标签下找 name=某个名字的节点 59 | # print(list(self.matcher.match(self.end).where('_.name=' + "'" + df_data['end'][m] + "'"))) 60 | # 相当于在'end'标签下找 name=某个名字的节点' 61 | # 然后为这两个节点创建关系 62 | try: 63 | rel = Relationship( 64 | self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'").first(), 65 | df_data['relation'][m], 66 | self.matcher.match(self.end).where('_.name=' + "'" + df_data['end'][m] + "'").first() 67 | ) 68 | self.graph.create(rel) 69 | except AttributeError as e: 70 | print(e, m) 71 | 72 | 73 | def data_extraction(df_data): 74 | node_start = [] 75 | for i in 
df_data['start'].tolist(): 76 | node_start.append(i) 77 | 78 | node_end = [] 79 | for i in df_data['end'].tolist(): 80 | node_end.append(i) 81 | 82 | # 去重 83 | node_start = list(set(node_start)) 84 | node_end = list(set(node_end)) 85 | return node_start, node_end 86 | 87 | 88 | if __name__ == '__main__': 89 | df_data = load_data() 90 | # print(df_data.head()) 91 | node_start, node_end = data_extraction(df_data) 92 | # 创建图 93 | create_data = DataToNeo4j() 94 | # 节点 95 | create_data.create_node(node_start, node_end) 96 | # 关系 97 | create_data.create_relation(df_data) 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /py2neo操作neo4j/py2neo简单练习/create_graph_v2.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : create_graph_v2.py 4 | # @Time : 2020/11/23 9:54 下午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | """ 10 | # -*- coding: utf-8 -*- 11 | # @File : create_graph_v1.py 12 | # @Time : 2020/11/23 6:52 下午 13 | # @Author : xiaolu 14 | # @Email : luxiaonlp@163.com 15 | # @Software: PyCharm 16 | """ 17 | from py2neo import Graph, Node, Relationship, NodeMatcher 18 | import pandas as pd 19 | from pdb import set_trace 20 | 21 | 22 | def load_data(): 23 | # 加载数据 24 | data = pd.read_excel('./santi.xlsx') 25 | # data = pd.read_excel('./mingchaonaxieshier.xlsx') 26 | # data = pd.read_excel('./test.xlsx') 27 | start = data['S'].tolist() 28 | relation = data['P'].tolist() 29 | end = data['O'].tolist() 30 | start_list = [str(i) for i in start] 31 | relation_list = [str(i) for i in relation] 32 | end_list = [str(i) for i in end] 33 | link_dict = dict() 34 | link_dict['start'] = start_list 35 | link_dict['relation'] = relation_list 36 | link_dict['end'] = end_list 37 | df_data = pd.DataFrame(link_dict) 38 | return df_data 39 | 40 | 41 | class DataToNeo4j: 42 | def __init__(self): 43 | link = Graph() 44 | self.graph = link 45 | 46 | self.start = 'start' 47 | self.end = 'end' 48 | 49 | self.graph.delete_all() # 将之前的图 全部删除 50 | self.matcher = NodeMatcher(link) # 为了查找 51 | 52 | def create_node(self, start, end): 53 | # 创建节点 54 | temp = [] 55 | temp.extend(start) 56 | temp.extend(end) 57 | temp = list(set(temp)) 58 | for t in temp: 59 | node = Node(self.start, name=t) 60 | self.graph.create(node) 61 | 62 | 63 | # for name in start: 64 | # node = Node(self.start, name=name) 65 | # self.graph.create(node) 66 | # 67 | # for name in end: 68 | # node = Node(self.end, name=name) 69 | # self.graph.create(node) 70 | 71 | def create_relation(self, df_data): 72 | m = 0 73 | for m in range(0, len(df_data)): 74 | # print(list(self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'"))) 75 | # 相当于在'start'标签下找 name=某个名字的节点 76 | # print(list(self.matcher.match(self.end).where('_.name=' + "'" + df_data['end'][m] + "'"))) 77 | # 相当于在'end'标签下找 name=某个名字的节点' 78 | # 然后为这两个节点创建关系 79 | try: 80 | rel = Relationship( 81 | self.matcher.match(self.start).where('_.name=' + "'" + df_data['start'][m] + "'").first(), 82 | df_data['relation'][m], 83 | self.matcher.match(self.start).where('_.name=' + "'" + df_data['end'][m] + "'").first() 84 | ) 85 | self.graph.create(rel) 86 | except AttributeError as e: 87 | print(e, m) 88 | 89 | 90 | def data_extraction(df_data): 91 | node_start = [] 92 | for i in df_data['start'].tolist(): 93 | node_start.append(i) 94 | 95 | node_end = [] 96 | for i 
in df_data['end'].tolist(): 97 | node_end.append(i) 98 | 99 | # 去重 100 | node_start = list(set(node_start)) 101 | node_end = list(set(node_end)) 102 | return node_start, node_end 103 | 104 | 105 | if __name__ == '__main__': 106 | df_data = load_data() 107 | # print(df_data.head()) 108 | node_start, node_end = data_extraction(df_data) 109 | # 创建图 110 | create_data = DataToNeo4j() 111 | # 节点 112 | create_data.create_node(node_start, node_end) 113 | # 关系 114 | create_data.create_relation(df_data) 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /py2neo操作neo4j/py2neo简单练习/mingchaonaxieshier.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/py2neo简单练习/mingchaonaxieshier.xlsx -------------------------------------------------------------------------------- /py2neo操作neo4j/py2neo简单练习/santi.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/py2neo简单练习/santi.xlsx -------------------------------------------------------------------------------- /py2neo操作neo4j/py2neo简单练习/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/py2neo简单练习/test.xlsx -------------------------------------------------------------------------------- /py2neo操作neo4j/readme.txt: -------------------------------------------------------------------------------- 1 | 1. 创造一个节点 2 | create (n:Person {name:"我", age=21}) 3 | 2. 创建关系 4 | create (p:Person {name:"我", age:"23"})-[:包工程{金额:10000}]->(n:Person {name:"好大哥", age:"35"}) 5 | 3. 删除节点 注意 删除有连接的节点时 必须先删掉关系 6 | create (n:Person {name:"XL", age:23}) 7 | match (n:Person {name:"XL"}) delete n 8 | 4. 删除关系 9 | match (p:Person {name:"我", age:"23"})-[f:包工程{金额:10000}]->(n:Person {name:"好大哥", age:"35"}) delete f 10 | 5. 加上标签 11 | match (t:Person) where id(t)=2 set t:好人 return t 12 | 同个某个节点的id 找到它 然后给其设置一个好人的标签 13 | 6. 额外增加属性 14 | match (a:好人) where id(a)=2 set a.战斗力=200 return a 15 | 在好人标签中找一个节点的id为2 然后给其加一个战斗力属性 并设置其值为200 16 | 7. 查找 17 | create (:Person {name:"唐僧", age:"79"})-[:师傅 {s_time:"2020-11-23"}]->(:Person {name:"孙悟空", age:"1w"}) 18 | match (a:Person)-[:师傅]->(b:Person) return a,b 19 | 创建了唐僧和孙悟空是师傅关系 然后 找关系为师傅的两个节点。 20 | 8. 
快速清空数据库 21 | match (n) detach delete n -------------------------------------------------------------------------------- /py2neo操作neo4j/事件三元组抽取/ltp的使用.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : ltp的使用.py 4 | # @Time : 2020/11/25 9:20 下午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | from ltp import LTP 10 | # 安装ltp: pip install ltp -i https://pypi.douban.com/simple/ 11 | # 学习文档: http://ltp.ai/docs/quickstart.html 12 | 13 | 14 | def fenju(): 15 | # 分句子 16 | sents = ltp.sent_split(["他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"]) 17 | print(sents) 18 | 19 | 20 | def fenci(): 21 | # 可以加载自己的词表 22 | ltp.init_dict(path='my_vocab.txt', max_window=4) 23 | segment, _ = ltp.seg(['我是你爸,我是你妈']) 24 | print(segment) 25 | 26 | 27 | def cixingbiaozhu(): 28 | seg, hidden = ltp.seg(['他叫汤姆去拿外衣。']) 29 | pos = ltp.pos(hidden) 30 | print(seg) 31 | print(pos) 32 | 33 | 34 | def mingmingshitishibie(): 35 | seg, hidden = ltp.seg(['他叫汤姆去拿外衣。孙悟空不同意咋办? 但是奥特曼肯定会同意']) 36 | ner = ltp.ner(hidden) 37 | print(seg) 38 | print(ner) 39 | 40 | for i in ner[0]: 41 | tag = i[0] 42 | name = seg[0][i[1]: i[2]+1] 43 | print(tag, ":", name) 44 | 45 | 46 | def yuyijuesebiaozhu(): 47 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"]) 48 | srl = ltp.srl(hidden) 49 | print(srl) # 包含了空 50 | 51 | srl = ltp.srl(hidden, keep_empty=False) 52 | print(srl) 53 | 54 | 55 | def yicunjufafenxi(): 56 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"]) 57 | dep = ltp.dep(hidden) 58 | print(dep) 59 | 60 | 61 | def yicunjufashu(): 62 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"]) 63 | sdp = ltp.sdp(hidden, graph=False) 64 | print(sdp) 65 | 66 | 67 | def yicunjufafenxitu(): 68 | seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"]) 69 | sdp = ltp.sdp(hidden, graph=True) 70 | print(sdp) 71 | 72 | 73 | 74 | 75 | 76 | if __name__ == '__main__': 77 | ltp = LTP() # ltp = LTP(path = "base|small|tiny") 默认下载small 78 | 79 | # 1. 分句 80 | # fenju() 81 | 82 | # 2. 分词 83 | # fenci() 84 | 85 | # 3. 词性标注 86 | # cixingbiaozhu() 87 | 88 | # 4. 命名实体识别 89 | # mingmingshitishibie() 90 | 91 | # 5. 语义角色标注 92 | # yuyijuesebiaozhu() 93 | 94 | # 6. 依存句法分析 95 | # yicunjufafenxi() 96 | 97 | # 7. 依存句法树 98 | yicunjufashu() 99 | 100 | # 8. 
依存句法分析(图) 101 | yicunjufafenxitu() 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /py2neo操作neo4j/事件三元组抽取/my_vocab.txt: -------------------------------------------------------------------------------- 1 | 我是 2 | 你妈 -------------------------------------------------------------------------------- /py2neo操作neo4j/事件三元组抽取/readme.txt: -------------------------------------------------------------------------------- 1 | --------------2020-12-9 更新---------------------- 2 | 这里不建议用ltp做三元组抽取,最近学习了一个深度学习模型进行三元组抽取 在我的另一个仓库 3 | 4 | [链接](https://github.com/shawroad/NLP_pytorch_project/tree/master/relation_extraction/lstm_cnn_information_extract) 5 | 6 | 7 | --------------2020-11-28 更新---------------------- 8 | 迪哥使用的是pyltp。 这里我不推荐用pyltp,这个包目前已经不更新了。已经是老古董了。加载的模型估计也过时了。 9 | 10 | 这里我推荐使用ltp 11 | 12 | 安装: pip install ltp -i https://pypi.douban.com/simple/ 13 | 14 | 测试安装成功与否: from ltp import LTP 15 | 16 | 安装成功后 下载模型 直接执行下面的代码 就可以下载了 17 | from ltp import LTP 18 | ltp = LTP() # ltp = LTP(path = "base|small|tiny") 可以指定参数 默认下载的是small 180m左右 19 | 20 | 21 | 这些操作完成以后 建议先看看ltp的使用方法 22 | 可以看代码 ltp的使用.py 或者看官方文档:http://ltp.ai/docs/quickstart.html 23 | 然后在去看三元组的抽取 24 | -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/.DS_Store -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/ahocorasick的使用/demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : demo.py 4 | # @Time : 2020/11/25 10:26 上午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | # 安装 pip install ahocorasick -i https://pypi.douban.com/simple 10 | import ahocorasick 11 | 12 | 13 | if __name__ == '__main__': 14 | wordlist = ['长春海外制药接骨续筋片', '香菇炖甲鱼', '三鹤药业黄柏胶囊', '上海衡山熊去氧胆酸片', '升和药业依托泊苷注射液', '怡诺思', 15 | '人格障碍', '转铁蛋白饱和度', '脾囊肿', '素烧白萝卜', '利君现代冠脉宁片', '上海复华药业注射用还原型谷', '阴囊上有白色小疙瘩', 16 | '腹痛伴休克', '成都通德胰激肽原酶肠溶片', '蒸猪肝', '河北百善血尿胶囊', '精神障碍', '输卵管畸形', '元和抑眩宁胶囊', '莲藕豆腐', 17 | '辰欣哈西奈德溶液', '信谊烟酸片', '慢性胆囊炎', '参芪降糖颗粒', '康普药业盐酸普萘洛尔片', '西安迪赛胸腺肽肠溶片', 18 | '双鹭药业注射用复合辅酶', '慢性筛窦炎', '新高制药维胺酯维E乳膏', '冰黄肤乐软膏', '神经类疾病', '液晶热图', 19 | '枣(干)', '股外侧皮神经病', '浙江惠松硅炭银片', '牙根外露', '湖北潜江氯霉素滴眼液', '盐类皮质激素分泌过多', '五子衍宗丸', 20 | '小儿阵发性睡眠性血红蛋白尿症', '功能失调性子宫出血病', '茵栀黄口服液', '眼底出血和渗出', '斯达制药注射用头孢噻肟钠', '复方白芷酊', 21 | '胫腓骨骨折', '西南药业氯霉素片', '宫颈炎', '茶碱缓释胶囊', '原发性硬化性胆管炎', '郑州韩都利肺胶囊', '咽反射消失', 22 | '脊髓灰质炎', '甲状腺片', '回盲瓣功能不全', '牛黄清胃丸', '乙肝e抗体', '马齿苋粥', '动脉硬化', '宝宝乐', '肠闭锁', '肺放线菌病', 23 | '江苏晨牌产妇安颗粒', '犬吠样咳嗽', '胃康灵胶囊', '小儿烟酸缺乏病', '青龙防风通圣丸', '广东南国维生素C片', '碘化油咀嚼片', 24 | '西乐葆', '伟哥甲磺酸酚妥拉明分散片', '成都迪康药业樟脑醑', '斑疹', '五花炖墨鱼', '肉炖芸豆粉条', '陕西东泰制药益脉康胶囊', 25 | '桔梗八味颗粒', '华南牌溴丙胺太林片', '吉林敖东洮南小牛脾提取物注', '仁青芒觉', '血吸虫病与肝胆疾病', '持续性枕横位难产', 26 | '弯曲菌感染', '丝瓜蘑菇肉片汤', '长春银诺克清咽片', '肝叶萎缩', '迪皿盐酸左西替利嗪口服溶液', '阿司匹林'] 27 | 28 | # 建树 29 | actree = ahocorasick.Automaton() 30 | for index, word in enumerate(wordlist): 31 | actree.add_word(word, (index, word)) 32 | actree.make_automaton() 33 | 34 | for i in actree.iter('昨天发烧,服用了阿司匹林,并且还吃了牛黄清胃丸,饭是吃了瓜烧白菜,大便有点色浅'): 35 | print(i) 36 | -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/build_medical_graph.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : build_medical_graph.py 4 | # @Time : 2020/11/24 8:39 下午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | import json 10 | from py2neo import Graph, Node 11 | 12 | 13 | class MedicalGraph: 14 | def __init__(self): 15 | self.data_path = './data/medical_min.json' 16 | self.g = Graph() # 这里填自己的信息 17 | self.g.delete_all() # 将之前的图 全部删除 18 | 19 | def read_nodes(self): 20 | # 共7类节点 21 | drugs = [] # 药品 22 | foods = [] # 食物 23 | checks = [] # 检查 24 | departments = [] # 科室 25 | producers = [] # 药品大类 26 | diseases = [] # 疾病 27 | symptoms = [] # 症状 28 | 29 | disease_infos = [] # 疾病信息 30 | 31 | # 构建节点实体关系 32 | rels_department = [] # 科室-科室关系 33 | rels_noteat = [] # 疾病-忌吃食物关系 34 | rels_doeat = [] # 疾病-宜吃食物关系 35 | rels_recommandeat = [] # 疾病-推荐吃食物关系 36 | rels_commonddrug = [] # 疾病-通用药品关系 37 | rels_recommanddrug = [] # 疾病-热门药品关系 38 | rels_check = [] # 疾病-检查关系 39 | rels_drug_producer = [] # 厂商-药物关系 40 | 41 | rels_symptom = [] # 疾病症状关系 42 | rels_acompany = [] # 疾病并发关系 43 | rels_category = [] # 疾病与科室之间的关系 44 | 45 | count = 0 46 | for data in open(self.data_path, encoding='utf8'): 47 | disease_dict = {} 48 | count += 1 49 | print(count) 50 | data_json = json.loads(data) 51 | disease = data_json['name'] # 疾病名 52 | disease_dict['name'] = disease 53 | diseases.append(disease) 54 | disease_dict['desc'] = '' 55 | disease_dict['prevent'] = '' 56 | disease_dict['cause'] = '' 57 | disease_dict['easy_get'] = '' 58 | disease_dict['cure_department'] = '' 59 | disease_dict['cure_way'] = '' 60 | disease_dict['cure_lasttime'] = '' 61 | disease_dict['symptom'] = '' 62 | disease_dict['cured_prob'] = '' 63 | 64 | # 做症状 然后做疾病和症状的关系 65 | if 'symptom' in data_json: 66 | symptoms += data_json['symptom'] # 这里加入所有的症状 67 | for symptom in data_json['symptom']: 68 | rels_symptom.append([disease, symptom]) 69 | 70 | # 做并发症 并做疾病与并发症的关系 71 | if 'acompany' in data_json: 72 | for acompany in data_json['acompany']: 73 | rels_acompany.append([disease, acompany]) 74 | 75 | # 做描述 不和病做关系 当做病的属性 76 | if 'desc' in data_json: 77 | disease_dict['desc'] = data_json['desc'] 78 | 79 | # 80 | if 'prevent' in data_json: 81 | disease_dict['prevent'] = data_json['prevent'] 82 | 83 | if 'cause' in data_json: 84 | disease_dict['cause'] = data_json['cause'] 85 | 86 | if 'get_prob' in data_json: 87 | disease_dict['get_prob'] = data_json['get_prob'] 88 | 89 | if 'easy_get' in data_json: 90 | disease_dict['easy_get'] = data_json['easy_get'] 91 | 92 | # 科室 93 | if 'cure_department' in data_json: 94 | cure_department = data_json['cure_department'] 95 | if len(cure_department) == 1: 96 | rels_category.append([disease, cure_department[0]]) 97 | if len(cure_department) == 2: 98 | big = cure_department[0] 99 | small = cure_department[1] 100 | rels_department.append([small, big]) 101 | rels_category.append([disease, small]) 102 | disease_dict['cure_department'] = cure_department 103 | departments += cure_department 104 | 105 | if 'cure_way' in data_json: 106 | disease_dict['cure_way'] = data_json['cure_way'] 107 | 108 | if 'cure_lasttime' in data_json: 109 | disease_dict['cure_lasttime'] = data_json['cure_lasttime'] 110 | 111 | if 'cured_prob' in data_json: 112 | disease_dict['cured_prob'] = data_json['cured_prob'] 113 | 114 | if 'common_drug' in data_json: 115 | common_drug = data_json['common_drug'] 116 | for drug in common_drug: 117 | rels_commonddrug.append([disease, drug]) 118 | drugs += 
common_drug 119 | 120 | if 'recommand_drug' in data_json: 121 | recommand_drug = data_json['recommand_drug'] 122 | drugs += recommand_drug 123 | for drug in recommand_drug: 124 | rels_recommanddrug.append([disease, drug]) 125 | 126 | if 'not_eat' in data_json: 127 | not_eat = data_json['not_eat'] 128 | for _not in not_eat: 129 | rels_noteat.append([disease, _not]) 130 | 131 | foods += not_eat 132 | do_eat = data_json['do_eat'] 133 | for _do in do_eat: 134 | rels_doeat.append([disease, _do]) 135 | 136 | foods += do_eat 137 | recommand_eat = data_json['recommand_eat'] 138 | 139 | for _recommand in recommand_eat: 140 | rels_recommandeat.append([disease, _recommand]) 141 | foods += recommand_eat 142 | 143 | if 'check' in data_json: 144 | check = data_json['check'] 145 | for _check in check: 146 | rels_check.append([disease, _check]) 147 | checks += check 148 | if 'drug_detail' in data_json: 149 | drug_detail = data_json['drug_detail'] 150 | producer = [i.split('(')[0] for i in drug_detail] 151 | rels_drug_producer += [[i.split('(')[0], i.split('(')[-1].replace(')', '')] for i in drug_detail] 152 | producers += producer 153 | disease_infos.append(disease_dict) 154 | return set(drugs), set(foods), set(checks), set(departments), set(producers), set(symptoms), set(diseases), \ 155 | disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, \ 156 | rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category 157 | 158 | def create_diseases_nodes(self, disease_infos): 159 | ''' 160 | 创建知识图谱中心疾病的节点 161 | ''' 162 | count = 0 163 | for disease_dict in disease_infos: 164 | # 疾病节点里面包含几种属性信息 165 | node = Node("Disease", name=disease_dict['name'], desc=disease_dict['desc'], 166 | prevent=disease_dict['prevent'], cause=disease_dict['cause'], 167 | easy_get=disease_dict['easy_get'], cure_lasttime=disease_dict['cure_lasttime'], 168 | cure_department=disease_dict['cure_department'] 169 | , cure_way=disease_dict['cure_way'], cured_prob=disease_dict['cured_prob']) 170 | self.g.create(node) 171 | count += 1 172 | print(count) 173 | return 174 | 175 | def create_node(self, label, nodes): 176 | ''' 177 | 建立节点 178 | ''' 179 | count = 0 180 | for node_name in nodes: 181 | node = Node(label, name=node_name) 182 | self.g.create(node) 183 | count += 1 184 | print(count, len(nodes)) 185 | return 186 | 187 | def create_graphnodes(self): 188 | Drugs, Foods, Checks, Departments, Producers, Symptoms, Diseases, disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category = self.read_nodes() 189 | self.create_diseases_nodes(disease_infos) 190 | 191 | self.create_node('Drug', Drugs) 192 | print(len(Drugs)) 193 | 194 | self.create_node('Food', Foods) 195 | print(len(Foods)) 196 | 197 | self.create_node('Check', Checks) 198 | print(len(Checks)) 199 | 200 | self.create_node('Department', Departments) 201 | print(len(Departments)) 202 | 203 | self.create_node('Producer', Producers) 204 | print(len(Producers)) 205 | 206 | self.create_node('Symptom', Symptoms) 207 | 208 | return 209 | 210 | def create_graphrels(self): 211 | Drugs, Foods, Checks, Departments, Producers, Symptoms, Diseases, disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category = self.read_nodes() 212 | self.create_relationship('Disease', 
'Food', rels_recommandeat, 'recommand_eat', '推荐食谱') 213 | self.create_relationship('Disease', 'Food', rels_noteat, 'no_eat', '忌吃') 214 | self.create_relationship('Disease', 'Food', rels_doeat, 'do_eat', '宜吃') 215 | self.create_relationship('Department', 'Department', rels_department, 'belongs_to', '属于') 216 | self.create_relationship('Disease', 'Drug', rels_commonddrug, 'common_drug', '常用药品') 217 | self.create_relationship('Producer', 'Drug', rels_drug_producer, 'drugs_of', '生产药品') 218 | self.create_relationship('Disease', 'Drug', rels_recommanddrug, 'recommand_drug', '好评药品') 219 | self.create_relationship('Disease', 'Check', rels_check, 'need_check', '诊断检查') 220 | self.create_relationship('Disease', 'Symptom', rels_symptom, 'has_symptom', '症状') 221 | self.create_relationship('Disease', 'Disease', rels_acompany, 'acompany_with', '并发症') 222 | self.create_relationship('Disease', 'Department', rels_category, 'belongs_to', '所属科室') 223 | 224 | def create_relationship(self, start_node, end_node, edges, rel_type, rel_name): 225 | '''创建实体关联边''' 226 | count = 0 227 | # 去重处理 228 | set_edges = [] 229 | 230 | for edge in edges: 231 | set_edges.append('###'.join(edge)) 232 | 233 | all = len(set(set_edges)) 234 | for edge in set(set_edges): 235 | edge = edge.split('###') 236 | p = edge[0] 237 | q = edge[1] 238 | query = "match(p:%s),(q:%s) where p.name='%s'and q.name='%s' create (p)-[rel:%s{name:'%s'}]->(q)" % ( 239 | start_node, end_node, p, q, rel_type, rel_name) 240 | try: 241 | self.g.run(query) 242 | count += 1 243 | print(rel_type, count, all) 244 | except Exception as e: 245 | print(e) 246 | return 247 | 248 | 249 | if __name__ == '__main__': 250 | # 实例化类图 251 | handler = MedicalGraph() 252 | 253 | # 创建节点 254 | handler.create_graphnodes() 255 | # 创建关系 256 | handler.create_graphrels() 257 | 258 | 259 | 260 | -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/data_process/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : __init__.py.py 4 | # @Time : 2020/11/25 10:15 上午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/answer_search.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/answer_search.cpython-37.pyc -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_classifier.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_classifier.cpython-37.pyc 
-------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_parser.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawroad/Python-Library-Learning/48c31d49898005dc4300cfca5cea13180f9b5cd8/py2neo操作neo4j/医疗知识图谱问答/data_process/__pycache__/question_parser.cpython-37.pyc -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/data_process/answer_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : answer_search.py 4 | # @Time : 2020/11/25 11:08 上午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | from py2neo import Graph 10 | 11 | 12 | class AnswerSearcher: 13 | def __init__(self): 14 | self.g = Graph() 15 | self.num_limit = 20 16 | 17 | def search_main(self, sqls): 18 | # 执行cypher查询,并返回相应结果 19 | final_answers = [] 20 | for sql_ in sqls: 21 | question_type = sql_['question_type'] 22 | queries = sql_['sql'] 23 | answers = [] 24 | for query in queries: 25 | ress = self.g.run(query).data() 26 | answers += ress 27 | final_answer = self.answer_prettify(question_type, answers) 28 | if final_answer: 29 | final_answers.append(final_answer) 30 | return final_answers 31 | 32 | def answer_prettify(self, question_type, answers): 33 | # 根据对应的question_type,调用相应的回复模板 34 | final_answer = [] 35 | if not answers: 36 | return '' 37 | if question_type == 'disease_symptom': 38 | desc = [i['n.name'] for i in answers] 39 | subject = answers[0]['m.name'] 40 | final_answer = '{0}的症状包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 41 | 42 | elif question_type == 'symptom_disease': 43 | desc = [i['m.name'] for i in answers] 44 | subject = answers[0]['n.name'] 45 | final_answer = '症状{0}可能染上的疾病有:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 46 | 47 | elif question_type == 'disease_cause': 48 | desc = [i['m.cause'] for i in answers] 49 | subject = answers[0]['m.name'] 50 | final_answer = '{0}可能的成因有:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 51 | 52 | elif question_type == 'disease_prevent': 53 | desc = [i['m.prevent'] for i in answers] 54 | subject = answers[0]['m.name'] 55 | final_answer = '{0}的预防措施包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 56 | 57 | elif question_type == 'disease_lasttime': 58 | desc = [i['m.cure_lasttime'] for i in answers] 59 | subject = answers[0]['m.name'] 60 | final_answer = '{0}治疗可能持续的周期为:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 61 | 62 | elif question_type == 'disease_cureway': 63 | desc = [';'.join(i['m.cure_way']) for i in answers] 64 | subject = answers[0]['m.name'] 65 | final_answer = '{0}可以尝试如下治疗:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 66 | 67 | elif question_type == 'disease_cureprob': 68 | desc = [i['m.cured_prob'] for i in answers] 69 | subject = answers[0]['m.name'] 70 | final_answer = '{0}治愈的概率为(仅供参考):{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 71 | 72 | elif question_type == 'disease_easyget': 73 | desc = [i['m.easy_get'] for i in answers] 74 | subject = answers[0]['m.name'] 75 | 76 | final_answer = '{0}的易感人群包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 77 | 78 | elif question_type == 'disease_desc': 79 | desc = [i['m.desc'] for i in answers] 80 | subject = answers[0]['m.name'] 81 | final_answer = '{0},熟悉一下:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 82 | 83 | elif question_type == 'disease_acompany': 84 | desc1 = [i['n.name'] for i in answers] 85 | desc2 = [i['m.name'] for i in answers] 86 | subject = answers[0]['m.name'] 87 | desc = [i for i in desc1 + desc2 if i != subject] 88 | final_answer = '{0}的并发症包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 89 | 90 | elif question_type == 'disease_not_food': 91 | desc = [i['n.name'] for i in answers] 92 | subject = answers[0]['m.name'] 93 | final_answer = '{0}忌食的食物包括有:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 94 | 95 | elif question_type == 'disease_do_food': 96 | do_desc = [i['n.name'] for i in answers if i['r.name'] == '宜吃'] 97 | recommand_desc = [i['n.name'] for i in answers if i['r.name'] == '推荐食谱'] 98 | subject = answers[0]['m.name'] 99 | final_answer = '{0}宜食的食物包括有:{1}\n推荐食谱包括有:{2}'.format(subject, ';'.join(list(set(do_desc))[:self.num_limit]), ';'.join(list(set(recommand_desc))[:self.num_limit])) 100 | 101 | elif question_type == 'food_not_disease': 102 | desc = [i['m.name'] for i in answers] 103 | subject = answers[0]['n.name'] 104 | final_answer = '患有{0}的人最好不要吃{1}'.format(';'.join(list(set(desc))[:self.num_limit]), subject) 105 | 106 | elif question_type == 'food_do_disease': 107 | desc = [i['m.name'] for i in answers] 108 | subject = answers[0]['n.name'] 109 | final_answer = '患有{0}的人建议多试试{1}'.format(';'.join(list(set(desc))[:self.num_limit]), subject) 110 | 111 | elif question_type == 'disease_drug': 112 | desc = [i['n.name'] for i in answers] 113 | subject = answers[0]['m.name'] 114 | final_answer = '{0}通常使用的药品包括:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 115 | 116 | elif question_type == 'drug_disease': 117 | desc = [i['m.name'] for i in answers] 118 | subject = answers[0]['n.name'] 119 | final_answer = '{0}主治的疾病有{1},可以试试'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 120 | 121 | elif question_type == 'disease_check': 122 | desc = [i['n.name'] for i in answers] 123 | subject = answers[0]['m.name'] 124 | final_answer = '{0}通常可以通过以下方式检查出来:{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 125 | 126 | elif question_type == 'check_disease': 127 | desc = [i['m.name'] for i in answers] 128 | subject = answers[0]['n.name'] 129 | final_answer = '通常可以通过{0}检查出来的疾病有{1}'.format(subject, ';'.join(list(set(desc))[:self.num_limit])) 130 | 131 | return final_answer 132 | 133 | 134 | if __name__ == '__main__': 135 | searcher = AnswerSearcher() -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/data_process/question_classifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : question_classifier.py 4 | # @Time : 2020/11/25 10:16 上午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | import os 10 | import ahocorasick 11 | from pdb import set_trace 12 | 13 | 14 | class QuestionClassifier: 15 | def __init__(self): 16 | # 特征词路径 17 | self.disease_path = './dict/disease.txt' 18 | self.department_path = './dict/department.txt' 19 | self.check_path = './dict/check.txt' 20 | self.drug_path = './dict/drug.txt' 21 | self.food_path = './dict/food.txt' 22 | self.producer_path = './dict/producer.txt' 23 | self.symptom_path = './dict/symptom.txt' 24 | self.deny_path = './dict/deny.txt' 25 | 26 | # 加载特征词 27 | self.disease_wds = [i.strip() for i in
open(self.disease_path) if i.strip()] 28 | self.department_wds = [i.strip() for i in open(self.department_path) if i.strip()] 29 | self.check_wds = [i.strip() for i in open(self.check_path) if i.strip()] 30 | self.drug_wds = [i.strip() for i in open(self.drug_path) if i.strip()] 31 | self.food_wds = [i.strip() for i in open(self.food_path) if i.strip()] 32 | self.producer_wds = [i.strip() for i in open(self.producer_path) if i.strip()] 33 | self.symptom_wds = [i.strip() for i in open(self.symptom_path) if i.strip()] 34 | self.region_words = set(self.department_wds + self.disease_wds + self.check_wds + self.drug_wds + self.food_wds + self.producer_wds + self.symptom_wds) 35 | self.deny_words = [i.strip() for i in open(self.deny_path) if i.strip()] 36 | 37 | # 建树 加快检索 可参考ahocorasick的使用 进行学习 actree 38 | self.region_tree = self.build_actree(list(self.region_words)) 39 | 40 | # 构建词典 41 | self.wdtype_dict = self.build_wdtype_dict() 42 | 43 | # 问句疑问词 44 | self.symptom_qwds = ['症状', '表征', '现象', '症候', '表现'] 45 | self.cause_qwds = ['原因', '成因', '为什么', '怎么会', '怎样才', '咋样才', '怎样会', '如何会', '为啥', '为何', '如何才会', '怎么才会', '会导致', '会造成'] 46 | self.acompany_qwds = ['并发症', '并发', '一起发生', '一并发生', '一起出现', '一并出现', '一同发生', '一同出现', '伴随发生', '伴随', '共现'] 47 | self.food_qwds = ['饮食', '饮用', '吃', '食', '伙食', '膳食', '喝', '菜', '忌口', '补品', '保健品', '食谱', '菜谱', '食用', '食物', '补品'] 48 | self.drug_qwds = ['药', '药品', '用药', '胶囊', '口服液', '炎片'] 49 | self.prevent_qwds = ['预防', '防范', '抵制', '抵御', '防止', ' 躲避', '逃避', '避开', '免得', '逃开', '避开', '避掉', '躲开', '躲掉', '绕开', 50 | '怎样才能不', '怎么才能不', '咋样才能不', '咋才能不', '如何才能不', 51 | '怎样才不', '怎么才不', '咋样才不', '咋才不', '如何才不', 52 | '怎样才可以不', '怎么才可以不', '咋样才可以不', '咋才可以不', '如何可以不', 53 | '怎样才可不', '怎么才可不', '咋样才可不', '咋才可不', '如何可不'] 54 | self.lasttime_qwds = ['周期', '多久', '多长时间', '多少时间', '几天', '几年', '多少天', '多少小时', '几个小时', '多少年'] 55 | self.cureway_qwds = ['怎么治疗', '如何医治', '怎么医治', '怎么治', '怎么医', '如何治', '医治方式', '疗法', '咋治', '怎么办', '咋办', '咋治'] 56 | self.cureprob_qwds = ['多大概率能治好', '多大几率能治好', '治好希望大么', '几率', '几成', '比例', '可能性', '能治', '可治', '可以治', '可以医'] 57 | self.easyget_qwds = ['易感人群', '容易感染', '易发人群', '什么人', '哪些人', '感染', '染上', '得上'] 58 | self.check_qwds = ['检查', '检查项目', '查出', '检查', '测出', '试出'] 59 | self.belong_qwds = ['属于什么科', '属于', '什么科', '科室'] 60 | self.cure_qwds = ['治疗什么', '治啥', '治疗啥', '医治啥', '治愈啥', '主治啥', '主治什么', '有什么用', '有何用', '用处', '用途', 61 | '有什么好处', '有什么益处', '有何益处', '用来', '用来做啥', '用来作甚', '需要', '要'] 62 | 63 | print('model init finished ......') 64 | return 65 | 66 | '''分类主函数''' 67 | def classify(self, question): 68 | data = {} 69 | medical_dict = self.check_medical(question) 70 | 71 | if not medical_dict: 72 | return {} 73 | 74 | data['args'] = medical_dict 75 | 76 | # 收集问句当中所涉及到的实体类型 77 | types = [] 78 | for type_ in medical_dict.values(): 79 | types += type_ 80 | 81 | question_types = [] 82 | # 症状 83 | if self.check_words(self.symptom_qwds, question) and ('disease' in types): 84 | question_type = 'disease_symptom' 85 | question_types.append(question_type) 86 | 87 | if self.check_words(self.symptom_qwds, question) and ('symptom' in types): 88 | question_type = 'symptom_disease' 89 | question_types.append(question_type) 90 | 91 | # 原因 92 | if self.check_words(self.cause_qwds, question) and ('disease' in types): 93 | question_type = 'disease_cause' 94 | question_types.append(question_type) 95 | # 并发症 96 | if self.check_words(self.acompany_qwds, question) and ('disease' in types): 97 | question_type = 'disease_acompany' 98 | question_types.append(question_type) 99 | 100 | # 推荐食品 101 | if self.check_words(self.food_qwds, 
question) and 'disease' in types: 102 | deny_status = self.check_words(self.deny_words, question) 103 | if deny_status: 104 | question_type = 'disease_not_food' 105 | else: 106 | question_type = 'disease_do_food' 107 | question_types.append(question_type) 108 | 109 | # 已知食物找疾病 110 | if self.check_words(self.food_qwds+self.cure_qwds, question) and 'food' in types: 111 | deny_status = self.check_words(self.deny_words, question) 112 | if deny_status: 113 | question_type = 'food_not_disease' 114 | else: 115 | question_type = 'food_do_disease' 116 | question_types.append(question_type) 117 | 118 | # 推荐药品 119 | if self.check_words(self.drug_qwds, question) and 'disease' in types: 120 | question_type = 'disease_drug' 121 | question_types.append(question_type) 122 | 123 | # 药品治啥病 124 | if self.check_words(self.cure_qwds, question) and 'drug' in types: 125 | question_type = 'drug_disease' 126 | question_types.append(question_type) 127 | 128 | # 疾病接受检查项目 129 | if self.check_words(self.check_qwds, question) and 'disease' in types: 130 | question_type = 'disease_check' 131 | question_types.append(question_type) 132 | 133 | # 已知检查项目查相应疾病 134 | if self.check_words(self.check_qwds+self.cure_qwds, question) and 'check' in types: 135 | question_type = 'check_disease' 136 | question_types.append(question_type) 137 | 138 | # 症状防御 139 | if self.check_words(self.prevent_qwds, question) and 'disease' in types: 140 | question_type = 'disease_prevent' 141 | question_types.append(question_type) 142 | 143 | # 疾病医疗周期 144 | if self.check_words(self.lasttime_qwds, question) and 'disease' in types: 145 | question_type = 'disease_lasttime' 146 | question_types.append(question_type) 147 | 148 | # 疾病治疗方式 149 | if self.check_words(self.cureway_qwds, question) and 'disease' in types: 150 | question_type = 'disease_cureway' 151 | question_types.append(question_type) 152 | 153 | # 疾病治愈可能性 154 | if self.check_words(self.cureprob_qwds, question) and 'disease' in types: 155 | question_type = 'disease_cureprob' 156 | question_types.append(question_type) 157 | 158 | # 疾病易感染人群 159 | if self.check_words(self.easyget_qwds, question) and 'disease' in types : 160 | question_type = 'disease_easyget' 161 | question_types.append(question_type) 162 | 163 | # 若没有查到相关的外部查询信息,那么则将该疾病的描述信息返回 164 | if question_types == [] and 'disease' in types: 165 | question_types = ['disease_desc'] 166 | 167 | # 若没有查到相关的外部查询信息,那么则将该疾病的描述信息返回 168 | if question_types == [] and 'symptom' in types: 169 | question_types = ['symptom_disease'] 170 | 171 | # 将多个分类结果进行合并处理,组装成一个字典 172 | data['question_types'] = question_types 173 | 174 | return data 175 | 176 | def build_wdtype_dict(self): 177 | # 构建词对应的类型 将词和对应的类型组成字典 178 | wd_dict = dict() 179 | for wd in self.region_words: 180 | wd_dict[wd] = [] 181 | if wd in self.disease_wds: 182 | wd_dict[wd].append('disease') 183 | if wd in self.department_wds: 184 | wd_dict[wd].append('department') 185 | if wd in self.check_wds: 186 | wd_dict[wd].append('check') 187 | if wd in self.drug_wds: 188 | wd_dict[wd].append('drug') 189 | if wd in self.food_wds: 190 | wd_dict[wd].append('food') 191 | if wd in self.symptom_wds: 192 | wd_dict[wd].append('symptom') 193 | if wd in self.producer_wds: 194 | wd_dict[wd].append('producer') 195 | return wd_dict 196 | 197 | def build_actree(self, wordlist): 198 | # 构造actree树 加速过滤 199 | actree = ahocorasick.Automaton() 200 | for index, word in enumerate(wordlist): 201 | actree.add_word(word, (index, word)) 202 | actree.make_automaton() 203 | return actree 204 | 205 | def check_medical(self, 
question): 206 | # 当用户输入一个问题时 先对问题进行过滤 207 | region_wds = [] 208 | for i in self.region_tree.iter(question): 209 | wd = i[1][1] # 取出当前匹配到的词的索引位置 210 | region_wds.append(wd) 211 | stop_wds = [] 212 | for wd1 in region_wds: 213 | for wd2 in region_wds: 214 | if wd1 in wd2 and wd1 != wd2: 215 | stop_wds.append(wd1) 216 | final_wds = [i for i in region_wds if i not in stop_wds] 217 | final_dict = {i: self.wdtype_dict.get(i) for i in final_wds} 218 | return final_dict 219 | 220 | def check_words(self, wds, sent): 221 | # 基于特征词进行分类 看当前特征在这个问题中包含不包含 222 | for wd in wds: 223 | if wd in sent: 224 | return True 225 | return False 226 | 227 | 228 | if __name__ == '__main__': 229 | handler = QuestionClassifier() 230 | while True: 231 | question = input("input an question:") 232 | data = handler.classify(question) 233 | print(data) 234 | -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/data_process/question_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : question_parser.py 4 | # @Time : 2020/11/25 11:04 上午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | class QuestionPaser: 10 | def build_entitydict(self, args): 11 | # 构建实体节点 12 | entity_dict = {} 13 | for arg, types in args.items(): 14 | for type in types: 15 | if type not in entity_dict: 16 | entity_dict[type] = [arg] 17 | else: 18 | entity_dict[type].append(arg) 19 | return entity_dict 20 | 21 | def parser_main(self, res_classify): 22 | # 解析主函数 23 | args = res_classify['args'] 24 | entity_dict = self.build_entitydict(args) 25 | question_types = res_classify['question_types'] 26 | sqls = [] 27 | for question_type in question_types: 28 | sql_ = {} 29 | sql_['question_type'] = question_type 30 | sql = [] 31 | if question_type == 'disease_symptom': 32 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 33 | 34 | elif question_type == 'symptom_disease': 35 | sql = self.sql_transfer(question_type, entity_dict.get('symptom')) 36 | 37 | elif question_type == 'disease_cause': 38 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 39 | 40 | elif question_type == 'disease_acompany': 41 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 42 | 43 | elif question_type == 'disease_not_food': 44 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 45 | 46 | elif question_type == 'disease_do_food': 47 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 48 | 49 | elif question_type == 'food_not_disease': 50 | sql = self.sql_transfer(question_type, entity_dict.get('food')) 51 | 52 | elif question_type == 'food_do_disease': 53 | sql = self.sql_transfer(question_type, entity_dict.get('food')) 54 | 55 | elif question_type == 'disease_drug': 56 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 57 | 58 | elif question_type == 'drug_disease': 59 | sql = self.sql_transfer(question_type, entity_dict.get('drug')) 60 | 61 | elif question_type == 'disease_check': 62 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 63 | 64 | elif question_type == 'check_disease': 65 | sql = self.sql_transfer(question_type, entity_dict.get('check')) 66 | 67 | elif question_type == 'disease_prevent': 68 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 69 | 70 | elif question_type == 'disease_lasttime': 71 | sql = self.sql_transfer(question_type, 
entity_dict.get('disease')) 72 | 73 | elif question_type == 'disease_cureway': 74 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 75 | 76 | elif question_type == 'disease_cureprob': 77 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 78 | 79 | elif question_type == 'disease_easyget': 80 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 81 | 82 | elif question_type == 'disease_desc': 83 | sql = self.sql_transfer(question_type, entity_dict.get('disease')) 84 | 85 | if sql: 86 | sql_['sql'] = sql 87 | 88 | sqls.append(sql_) 89 | return sqls 90 | 91 | def sql_transfer(self, question_type, entities): 92 | # 针对不同的问题 进行查找 93 | if not entities: 94 | return [] 95 | 96 | # 查询语句 97 | sql = [] 98 | # 查询疾病的原因 99 | if question_type == 'disease_cause': 100 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cause".format(i) for i in entities] 101 | 102 | # 查询疾病的防御措施 103 | elif question_type == 'disease_prevent': 104 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.prevent".format(i) for i in entities] 105 | 106 | # 查询疾病的持续时间 107 | elif question_type == 'disease_lasttime': 108 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cure_lasttime".format(i) for i in entities] 109 | 110 | # 查询疾病的治愈概率 111 | elif question_type == 'disease_cureprob': 112 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cured_prob".format(i) for i in entities] 113 | 114 | # 查询疾病的治疗方式 115 | elif question_type == 'disease_cureway': 116 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.cure_way".format(i) for i in entities] 117 | 118 | # 查询疾病的易发人群 119 | elif question_type == 'disease_easyget': 120 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.easy_get".format(i) for i in entities] 121 | 122 | # 查询疾病的相关介绍 123 | elif question_type == 'disease_desc': 124 | sql = ["MATCH (m:Disease) where m.name = '{0}' return m.name, m.desc".format(i) for i in entities] 125 | 126 | # 查询疾病有哪些症状 127 | elif question_type == 'disease_symptom': 128 | sql = ["MATCH (m:Disease)-[r:has_symptom]->(n:Symptom) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 129 | 130 | # 查询症状会导致哪些疾病 131 | elif question_type == 'symptom_disease': 132 | sql = ["MATCH (m:Disease)-[r:has_symptom]->(n:Symptom) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 133 | 134 | # 查询疾病的并发症 135 | elif question_type == 'disease_acompany': 136 | sql1 = ["MATCH (m:Disease)-[r:acompany_with]->(n:Disease) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 137 | sql2 = ["MATCH (m:Disease)-[r:acompany_with]->(n:Disease) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 138 | sql = sql1 + sql2 139 | # 查询疾病的忌口 140 | elif question_type == 'disease_not_food': 141 | sql = ["MATCH (m:Disease)-[r:no_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 142 | 143 | # 查询疾病建议吃的东西 144 | elif question_type == 'disease_do_food': 145 | sql1 = ["MATCH (m:Disease)-[r:do_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 146 | sql2 = ["MATCH (m:Disease)-[r:recommand_eat]->(n:Food) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 147 | sql = sql1 + sql2 148 | 149 | # 已知忌口查疾病 150 | elif question_type == 'food_not_disease': 151 | sql = ["MATCH (m:Disease)-[r:no_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, 
n.name".format(i) for i in entities] 152 | 153 | # 已知推荐查疾病 154 | elif question_type == 'food_do_disease': 155 | sql1 = ["MATCH (m:Disease)-[r:do_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 156 | sql2 = ["MATCH (m:Disease)-[r:recommand_eat]->(n:Food) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 157 | sql = sql1 + sql2 158 | 159 | # 查询疾病常用药品-药品别名记得扩充 160 | elif question_type == 'disease_drug': 161 | sql1 = ["MATCH (m:Disease)-[r:common_drug]->(n:Drug) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 162 | sql2 = ["MATCH (m:Disease)-[r:recommand_drug]->(n:Drug) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 163 | sql = sql1 + sql2 164 | 165 | # 已知药品查询能够治疗的疾病 166 | elif question_type == 'drug_disease': 167 | sql1 = ["MATCH (m:Disease)-[r:common_drug]->(n:Drug) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 168 | sql2 = ["MATCH (m:Disease)-[r:recommand_drug]->(n:Drug) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 169 | sql = sql1 + sql2 170 | # 查询疾病应该进行的检查 171 | elif question_type == 'disease_check': 172 | sql = ["MATCH (m:Disease)-[r:need_check]->(n:Check) where m.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 173 | 174 | # 已知检查查询疾病 175 | elif question_type == 'check_disease': 176 | sql = ["MATCH (m:Disease)-[r:need_check]->(n:Check) where n.name = '{0}' return m.name, r.name, n.name".format(i) for i in entities] 177 | return sql 178 | 179 | 180 | if __name__ == '__main__': 181 | handler = QuestionPaser() -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/dict/deny.txt: -------------------------------------------------------------------------------- 1 | 否 2 | 非 3 | 不 4 | 无 5 | 弗 6 | 勿 7 | 毋 8 | 未 9 | 没 10 | 莫 11 | 没有 12 | 防止 13 | 不再 14 | 不会 15 | 不能 16 | 忌 17 | 禁止 18 | 防止 19 | 难以 20 | 忘记 21 | 忽视 22 | 放弃 23 | 拒绝 24 | 杜绝 25 | 不是 26 | 并未 27 | 并无 28 | 仍未 29 | 难以出现 30 | 切勿 31 | 不要 32 | 不可 33 | 别 34 | 管住 35 | 注意 36 | 小心 37 | 少 38 | 39 | -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/dict/department.txt: -------------------------------------------------------------------------------- 1 | 心理科 2 | 妇科 3 | 耳鼻喉科 4 | 中医综合 5 | 泌尿内科 6 | 康复科 7 | 神经外科 8 | 生殖健康 9 | 肿瘤科 10 | 肛肠科 11 | 儿科 12 | 普外科 13 | 心胸外科 14 | 风湿免疫科 15 | 小儿外科 16 | 传染科 17 | 减肥 18 | 其他科室 19 | 肾内科 20 | 皮肤性病科 21 | 口腔科 22 | 不孕不育 23 | 五官科 24 | 整形美容科 25 | 消化内科 26 | 急诊科 27 | 肝胆外科 28 | 遗传病科 29 | 精神科 30 | 神经内科 31 | 小儿内科 32 | 肿瘤内科 33 | 皮肤科 34 | 中医科 35 | 骨外科 36 | 外科 37 | 呼吸内科 38 | 其他综合 39 | 眼科 40 | 内分泌科 41 | 性病科 42 | 妇产科 43 | 肝病 44 | 肿瘤外科 45 | 儿科综合 46 | 营养科 47 | 男科 48 | 产科 49 | 感染科 50 | 泌尿外科 51 | 血液科 52 | 心内科 53 | 烧伤科 54 | 内科 -------------------------------------------------------------------------------- /py2neo操作neo4j/医疗知识图谱问答/run_chatbot.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : run_chatbot.py 4 | # @Time : 2020/11/25 10:07 上午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | from data_process.question_classifier import QuestionClassifier 10 | from data_process.question_parser import QuestionPaser 11 | from data_process.answer_search import AnswerSearcher 12 | 13 | 14 | class ChatBotGraph: 15 | def __init__(self): 16 | self.classifier = QuestionClassifier() 17 | 
self.parser = QuestionPaser() 18 | self.searcher = AnswerSearcher() 19 | 20 | def chat_main(self, sent): 21 | answer = "您好, 我是小路医药智能助理,希望可以帮到您。如果没答上来,可联系120。祝您身体棒棒的!!!" 22 | res_classify = self.classifier.classify(sent) 23 | if not res_classify: 24 | return answer 25 | res_sql = self.parser.parser_main(res_classify) 26 | final_answers = self.searcher.search_main(res_sql) 27 | if not final_answers: 28 | return answer 29 | else: 30 | return '\n'.join(final_answers) 31 | 32 | 33 | if __name__ == '__main__': 34 | handler = ChatBotGraph() 35 | while True: 36 | question = input("用户:") 37 | answer = handler.chat_main(question) 38 | print("小路:", answer) 39 | -------------------------------------------------------------------------------- /pyecharts使用/001-柱状图.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 001-柱状图.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-17 6 | """ 7 | from pyecharts.charts import Bar 8 | from pyecharts import options as opts 9 | from pyecharts.globals import ThemeType 10 | 11 | 12 | bar = ( 13 | Bar({"theme": ThemeType.MACARONS}) # 设置主题 14 | # Bar() 15 | .set_global_opts( 16 | title_opts=opts.TitleOpts(title="各种衣服价格", subtitle="VS"), 17 | xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)) # 名字倾斜15度 18 | ) 19 | 20 | .add_xaxis(["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"]) 21 | .add_yaxis("商家A", [5, 20, 36, 10, 75, 90]) 22 | .add_yaxis("商家B", [5, 20, 36, 10, 75, 90]) 23 | 24 | ) 25 | bar.render('柱状图.html') -------------------------------------------------------------------------------- /pyecharts使用/002-折线图.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 002-折线图.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-17 6 | """ 7 | from pyecharts.charts import Line 8 | from pyecharts import options as opts 9 | 10 | 11 | if __name__ == "__main__": 12 | x_data = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] 13 | # x_data = [1, 2, 3, 4, 5, 6, 7] 14 | y_data = [820, 932, 901, 934, 1290, 1330, 1320] 15 | y_data2 = [237, 132, 401, 534, 290, 1230, 1120] 16 | 17 | line = ( 18 | Line() 19 | .set_global_opts( 20 | tooltip_opts=opts.TooltipOpts(is_show=True), 21 | title_opts=opts.TitleOpts(title="收入大比拼", pos_left="center"), # 标题 22 | legend_opts=opts.LegendOpts(pos_left="right"), # 线条示例放在右上角 23 | xaxis_opts=opts.AxisOpts(type_="category", name="星期"), # 横轴的类型与名字 24 | # 注意横轴type_等于value 和category的区别 25 | yaxis_opts=opts.AxisOpts( 26 | type_="value", 27 | name="收入", 28 | splitline_opts=opts.SplitLineOpts(is_show=True), # 是否显示横向格子线 29 | is_scale=True, 30 | ), # 纵轴的类型与名字 31 | ) 32 | .add_xaxis(xaxis_data=x_data) 33 | .add_yaxis( 34 | is_smooth=True, # 是否进行平滑处理 35 | series_name="小花收入", # 标识每条线 36 | y_axis=y_data, 37 | symbol="emptyCircle", 38 | linestyle_opts=opts.LineStyleOpts(width=2), # 设置线宽 39 | is_symbol_show=True, 40 | label_opts=opts.LabelOpts(is_show=True), # is_show显示是否需要标注数据 41 | ) 42 | .add_yaxis( 43 | series_name="王五", # 标识每条线 44 | y_axis=y_data2, 45 | symbol="emptyCircle", 46 | is_symbol_show=True, 47 | label_opts=opts.LabelOpts(is_show=True), # is_show显示是否需要标注数据 48 | 49 | # 自定义标记 50 | markpoint_opts=opts.MarkPointOpts( 51 | data=[opts.MarkPointItem(name="自定义标记点", coord=[x_data[2], y_data2[2]], value=y_data2[2])] 52 | ), 53 | ) 54 | 55 | ) 56 | line.render('折线图.html') 57 | -------------------------------------------------------------------------------- /pyecharts使用/003-饼状图.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | @file : 003-饼状图.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-17 6 | """ 7 | from pyecharts import options as opts 8 | from pyecharts.charts import Pie 9 | from pyecharts.faker import Faker 10 | 11 | 12 | if __name__ == '__main__': 13 | # 生成假数据 14 | # a, b = Faker.choose(), Faker.values() 15 | # print(a) 16 | # print(b) 17 | # ['可乐', '雪碧', '橙汁', '绿茶', '奶茶', '百威', '青岛'] 18 | # [97, 140, 75, 28, 89, 20, 143] 19 | pie = ( 20 | Pie() 21 | .set_global_opts(title_opts=opts.TitleOpts(title="Pie-设置颜色")) 22 | 23 | .add("", [list(z) for z in zip(Faker.choose(), Faker.values())]) 24 | 25 | .set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"]) # 每个所占面积的颜色设置 26 | 27 | .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}")) # 标签显示的样子 28 | ) 29 | pie.render("饼状图.html") 30 |
-------------------------------------------------------------------------------- /pyecharts使用/折线图.html: -------------------------------------------------------------------------------- (pyecharts 渲染生成的网页, 提取时 HTML/JS 标记已丢失, 仅保留页面标题 Awesome-pyecharts)
-------------------------------------------------------------------------------- /pyecharts使用/柱状图.html: -------------------------------------------------------------------------------- (pyecharts 渲染生成的网页, 提取时 HTML/JS 标记已丢失, 仅保留页面标题 Awesome-pyecharts)
-------------------------------------------------------------------------------- /pyecharts使用/饼状图.html: -------------------------------------------------------------------------------- (pyecharts 渲染生成的网页, 提取时 HTML/JS 标记已丢失, 仅保留页面标题 Awesome-pyecharts)
-------------------------------------------------------------------------------- /pymysql的使用/001-创建数据库.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : 001-创建数据库.py 4 | # @Time : 2020/11/24 1:59 下午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | import pymysql 10 | 11 | if __name__ == '__main__': 12 | # 打开数据库连接 13 | db = pymysql.connect('localhost', 'xxxxx', 'xxxxxx') 14 | 15 | # 使用 cursor() 方法创建一个游标对象 cursor 16 | cursor = db.cursor() 17 | 18 | # 创建数据库 19 | db_name = 'TESTDB' 20 | sql = "CREATE DATABASE {}".format(db_name) # 创建数据库 21 | cursor.execute(sql) 22 | 23 | # 使用 execute() 方法执行 SQL 查询 24 | cursor.execute("SELECT VERSION()") 25 | 26 | # 使用fetchone()方法获取单条数据 27 | data = cursor.fetchone() 28 | print("数据库的版本: ", data) 29 | # 关闭数据库连接 30 | db.close()
-------------------------------------------------------------------------------- /pymysql的使用/002-创建表插入数据.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : 002-创建表插入数据.py 4 | # @Time : 2020/11/24 2:20 下午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | import pymysql 10 | 11 | if __name__ == '__main__': 12 | # 打开数据库连接 13 | db = pymysql.connect("localhost", "xxxxx", "xxxxxxx", "TESTDB") 14 | 15 | # 使用 cursor() 方法创建一个游标对象 cursor 16 | cursor = db.cursor() 17 | 18 | # 使用 execute() 方法执行 SQL,如果表存在则删除 19 | cursor.execute("DROP TABLE IF EXISTS EMPLOYEE") 20 | 21 | sql = '''CREATE TABLE EMPLOYEE ( 22 | FIRST_NAME CHAR (20) NOT NULL, 23 | LAST_NAME CHAR (20), 24 | AGE INT, 25 | SEX CHAR (1), 26 | INCOME FLOAT 27 | ) 28 | ''' 29 | cursor.execute(sql) 30 | 31 | # 接着插入数据 32 | insert_sql = """INSERT INTO EMPLOYEE(FIRST_NAME, LAST_NAME, AGE, SEX, INCOME) 33 | VALUES ('Mac', 'Mohan', 20, 'M', 2000)""" 34 | try: 35 | # 执行sql语句 36 | cursor.execute(insert_sql) 37 | # 提交到数据库执行 38 | db.commit() 39 | except: 40 | print('滚犊子,插不进去') 41 | # 如果发生错误则回滚 42 | db.rollback() 43 | # 关闭数据库连接 44 | db.close()
-------------------------------------------------------------------------------- /pymysql的使用/003-查询.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : 003-查询.py 4 | # @Time : 2020/11/24 2:34 下午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | import pymysql 10 | 11 | if __name__ == '__main__': 12 | # 打开数据库连接 13 | db = pymysql.connect("localhost", "xxxxxx", "xxxxxxx", "TESTDB") 14 | 15 | # 使用cursor()方法获取操作游标 16 | cursor = db.cursor() 17 | 18 | # SQL 查询语句 19 | sql = "SELECT * FROM EMPLOYEE WHERE INCOME > %s" % (1000) 20 | try: 21 | cursor.execute(sql) 22 | 23 | # 获取所有记录列表 24 | results = cursor.fetchall() 25 | for row in results: 26 | fname = row[0] 27 | lname = row[1] 28 | age = row[2] 29 | sex = row[3] 30 | income = row[4] 31 | print('fname: {}, lname:{}, age:{}, sex:{}, income:{}'.format(fname, lname, age, sex, income)) 32 | except: 33 | print("啥也找不到") 34 | 35 | # 关闭数据库连接 36 | db.close()
-------------------------------------------------------------------------------- /pymysql的使用/004-更新.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : 004-更新.py 4 | # @Time : 2020/11/24 2:41 下午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | import pymysql 10
| 11 | if __name__ == "__main__": 12 | # 打开数据库连接 13 | db = pymysql.connect("localhost", "xxxxxx", "xxxxxx", "TESTDB") 14 | 15 | # 使用cursor()方法获取操作游标 16 | cursor = db.cursor() 17 | 18 | # SQL 更新语句 给男性加1岁 19 | sql = "UPDATE EMPLOYEE SET AGE = AGE + 1 WHERE SEX = '%c'" % ('M') 20 | try: 21 | # 执行SQL语句 22 | cursor.execute(sql) 23 | # 提交到数据库执行 24 | db.commit() 25 | except: 26 | # 发生错误时回滚 27 | db.rollback() 28 | 29 | # 关闭数据库连接 30 | db.close() -------------------------------------------------------------------------------- /pymysql的使用/005-删除.py: -------------------------------------------------------------------------------- 1 | """ 2 | # -*- coding: utf-8 -*- 3 | # @File : 005-删除.py 4 | # @Time : 2020/11/24 2:43 下午 5 | # @Author : xiaolu 6 | # @Email : luxiaonlp@163.com 7 | # @Software: PyCharm 8 | """ 9 | import pymysql 10 | 11 | if __name__ == '__main__': 12 | # 打开数据库连接 13 | db = pymysql.connect("localhost", "xxxxxx", "xxxxx", "TESTDB") 14 | 15 | # 使用cursor()方法获取操作游标 16 | cursor = db.cursor() 17 | 18 | # SQL 删除语句 19 | sql = "DELETE FROM EMPLOYEE WHERE AGE > %s" % (20) 20 | try: 21 | # 执行SQL语句 22 | cursor.execute(sql) 23 | # 提交修改 24 | db.commit() 25 | except: 26 | # 发生错误时回滚 27 | db.rollback() 28 | 29 | # 关闭连接 30 | db.close() -------------------------------------------------------------------------------- /python并发编程/001-多线程.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 001-多线程.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-01 6 | """ 7 | import threading 8 | import time 9 | import requests 10 | 11 | 12 | def craw(url): 13 | # 这是个爬虫 14 | r = requests.get(url) 15 | print(url, r.status_code) 16 | 17 | 18 | def single_thread(): 19 | # 单线程爬虫 20 | print('single_thread start') 21 | for url in urls: 22 | craw(url) 23 | print('single_thread end') 24 | 25 | 26 | def multi_thread(): 27 | # 多线程爬虫 28 | print("multi_thread begin") 29 | threads = [] 30 | for url in urls: 31 | threads.append( 32 | threading.Thread(target=craw, args=(url,)) # url, 之所以加逗号 是因为这里必须为元组 33 | ) 34 | 35 | # 启动多线程 36 | for thread in threads: 37 | thread.start() 38 | 39 | # 等待结束 40 | for thread in threads: 41 | thread.join() 42 | print("multi_thread end") 43 | 44 | 45 | if __name__ == '__main__': 46 | # 爬50页的内容 47 | urls = ['https://www.cnblogs.com/sitehome/p/{}'.format(page) for page in range(1, 50 + 1)] 48 | 49 | # 单线程走起 50 | start = time.time() 51 | single_thread() 52 | end = time.time() 53 | print("single thread cost:", end - start, "seconds") 54 | 55 | # 多线程走起 56 | start = time.time() 57 | multi_thread() 58 | end = time.time() 59 | print("multi thread cost:", end - start, "seconds") -------------------------------------------------------------------------------- /python并发编程/002-生产者消费者实现多线程爬虫.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 002-生产者消费者实现多线程爬虫.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-01 6 | """ 7 | import queue 8 | import time 9 | import random 10 | import threading 11 | import requests 12 | from bs4 import BeautifulSoup 13 | 14 | 15 | def craw(url): 16 | # 爬取网页内容 17 | r = requests.get(url) 18 | return r.text 19 | 20 | 21 | def parse(html): 22 | # 解析其中的内容 23 | soup = BeautifulSoup(html, "html.parser") 24 | links = soup.find_all("a", class_="post-item-title") 25 | return [(link["href"], link.get_text()) for link in links] # 那链接和标题拿出来 26 | 27 | 28 | def do_craw(url_queue: queue.Queue, html_queue: queue.Queue): 29 | ''' 30 | 生产者 31 | :param 
url_queue: url的队列 生产者从中拿出链接 去爬虫 32 | :param html_queue: 生产者将爬取的内容放到这里 33 | :return: 34 | ''' 35 | while True: 36 | url = url_queue.get() 37 | html = craw(url) 38 | html_queue.put(html) 39 | print('线程名: ', threading.current_thread().name, 40 | "url_queue.size=", url_queue.qsize()) # 获取url队列中还有多少待爬取的 41 | time.sleep(random.randint(1, 2)) 42 | 43 | 44 | def do_parse(html_queue: queue.Queue, fout): 45 | ''' 46 | 消费者 47 | :param html_queue: 生产者生产出的内容 48 | :param fout: 消费者将内容解析出来 存到fout中 49 | :return: 50 | ''' 51 | while True: 52 | html = html_queue.get() 53 | results = parse(html) 54 | for result in results: 55 | fout.write(str(result) + "\n") 56 | print('线程名: ', threading.current_thread().name, 57 | "html_queue.size=", html_queue.qsize()) 58 | time.sleep(random.randint(1, 2)) 59 | 60 | 61 | if __name__ == '__main__': 62 | # 待爬取的网页链接 63 | urls = [ 64 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1) 65 | ] 66 | 67 | url_queue = queue.Queue() 68 | html_queue = queue.Queue() 69 | 70 | # 将url放进队列中 71 | for url in urls: 72 | url_queue.put(url) 73 | 74 | # 启动三个线程去做生产者 75 | for idx in range(3): 76 | t = threading.Thread(target=do_craw, args=(url_queue, html_queue), 77 | name="craw{}".format(idx)) 78 | t.start() 79 | 80 | fout = open("data.txt", "w") 81 | # 启动两个线程去做消费者 82 | for idx in range(2): 83 | t = threading.Thread(target=do_parse, args=(html_queue, fout), 84 | name="parse{}".format(idx)) 85 | t.start() 86 | -------------------------------------------------------------------------------- /python并发编程/003-多线程锁机制.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 003-多线程锁机制.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-01 6 | """ 7 | import threading 8 | import time 9 | 10 | lock = threading.Lock() 11 | 12 | 13 | class Account: 14 | def __init__(self, balance): 15 | self.balance = balance 16 | 17 | 18 | def draw(account, amount): 19 | with lock: 20 | if account.balance >= amount: 21 | # time.sleep(0.1) # 如果不加锁,这里休息0.1秒,每次都会出问题,因为这里会引起线程阻塞,一定会切换 22 | print(threading.current_thread().name, "取钱成功") 23 | account.balance -= amount 24 | print(threading.current_thread().name, "余额", account.balance) 25 | else: 26 | print(threading.current_thread().name, 27 | "取钱失败,余额不足") 28 | 29 | 30 | if __name__ == "__main__": 31 | account = Account(1000) # 金额 32 | 33 | # 启动两个线程 分别去800块 34 | ta = threading.Thread(name="ta", target=draw, args=(account, 800)) 35 | tb = threading.Thread(name="tb", target=draw, args=(account, 800)) 36 | 37 | ta.start() 38 | tb.start() -------------------------------------------------------------------------------- /python并发编程/004-线程池的使用.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 004-线程池的使用.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-01 6 | """ 7 | import concurrent.futures 8 | import requests 9 | from bs4 import BeautifulSoup 10 | 11 | 12 | def craw(url): 13 | # 爬取网页内容 14 | r = requests.get(url) 15 | return r.text 16 | 17 | 18 | def parse(html): 19 | # 解析其中的内容 20 | soup = BeautifulSoup(html, "html.parser") 21 | links = soup.find_all("a", class_="post-item-title") 22 | return [(link["href"], link.get_text()) for link in links] # 那链接和标题拿出来 23 | 24 | 25 | if __name__ == '__main__': 26 | # 待爬取的网页链接 27 | urls = [ 28 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1) 29 | ] 30 | 31 | # craw 32 | with concurrent.futures.ThreadPoolExecutor() as pool: 33 | htmls 
= pool.map(craw, urls) 34 | htmls = list(zip(urls, htmls)) 35 | for url, html in htmls: 36 | print(url, len(html)) 37 | print("craw over") 38 | 39 | # parse 40 | with concurrent.futures.ThreadPoolExecutor() as pool: 41 | futures = {} 42 | for url, html in htmls: 43 | future = pool.submit(parse, html) 44 | futures[future] = url 45 | 46 | # for future, url in futures.items(): 47 | # print(url, future.result()) 48 | 49 | for future in concurrent.futures.as_completed(futures): 50 | url = futures[future] 51 | print(url, future.result()) 52 | -------------------------------------------------------------------------------- /python并发编程/005-线程池加速flask-web服务.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 005-线程池加速flask-web服务.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-01 6 | """ 7 | import flask 8 | import json 9 | import time 10 | from concurrent.futures import ThreadPoolExecutor 11 | 12 | app = flask.Flask(__name__) 13 | pool = ThreadPoolExecutor() 14 | 15 | 16 | def read_file(): 17 | time.sleep(0.1) 18 | return "file result" 19 | 20 | 21 | def read_db(): 22 | time.sleep(0.2) 23 | return "db result" 24 | 25 | 26 | def read_api(): 27 | time.sleep(0.3) 28 | return "api result" 29 | 30 | 31 | @app.route("/") 32 | def index(): 33 | result_file = pool.submit(read_file) 34 | result_db = pool.submit(read_db) 35 | result_api = pool.submit(read_api) 36 | 37 | return json.dumps({ 38 | "result_file": result_file.result(), 39 | "result_db": result_db.result(), 40 | "result_api": result_api.result(), 41 | }) 42 | 43 | 44 | if __name__ == "__main__": 45 | app.run() -------------------------------------------------------------------------------- /python并发编程/006-多进程的使用.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 006-多进程的使用.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-01 6 | """ 7 | import math 8 | from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor 9 | import time 10 | 11 | 12 | def is_prime(n): 13 | if n < 2: 14 | return False 15 | if n == 2: 16 | return True 17 | if n % 2 == 0: 18 | return False 19 | sqrt_n = int(math.floor(math.sqrt(n))) 20 | for i in range(3, sqrt_n + 1, 2): 21 | if n % i == 0: 22 | return False 23 | return True 24 | 25 | 26 | def single_thread(): 27 | for number in PRIMES: 28 | is_prime(number) 29 | 30 | 31 | def multi_thread(): 32 | with ThreadPoolExecutor() as pool: 33 | pool.map(is_prime, PRIMES) 34 | 35 | 36 | def multi_process(): 37 | with ProcessPoolExecutor() as pool: 38 | pool.map(is_prime, PRIMES) 39 | 40 | 41 | if __name__ == "__main__": 42 | PRIMES = [112272535095293] * 100 43 | 44 | start = time.time() 45 | single_thread() 46 | end = time.time() 47 | print("single_thread, cost:", end - start, "seconds") 48 | 49 | start = time.time() 50 | multi_thread() 51 | end = time.time() 52 | print("multi_thread, cost:", end - start, "seconds") 53 | 54 | start = time.time() 55 | multi_process() 56 | end = time.time() 57 | print("multi_process, cost:", end - start, "seconds") 58 | -------------------------------------------------------------------------------- /python并发编程/007-多进程加速flask-web服务.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 007-多进程加速flask-web服务.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-01 6 | """ 7 | import flask 8 | from concurrent.futures import ProcessPoolExecutor 9 | import math 10 | import 
json 11 | 12 | 13 | app = flask.Flask(__name__) 14 | 15 | 16 | def is_prime(n): 17 | if n < 2: 18 | return False 19 | if n == 2: 20 | return True 21 | if n % 2 == 0: 22 | return False 23 | sqrt_n = int(math.floor(math.sqrt(n))) 24 | for i in range(3, sqrt_n + 1, 2): 25 | if n % i == 0: 26 | return False 27 | return True 28 | 29 | 30 | @app.route("/is_prime/<numbers>") # 路由参数<numbers>, 形如 /is_prime/3,5,7 31 | def api_is_prime(numbers): 32 | number_list = [int(x) for x in numbers.split(",")] 33 | results = process_pool.map(is_prime, number_list) 34 | return json.dumps(dict(zip(number_list, results))) 35 | 36 | 37 | if __name__ == "__main__": 38 | process_pool = ProcessPoolExecutor() 39 | app.run() 40 |
-------------------------------------------------------------------------------- /python并发编程/008-协程爬虫.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 008-协程爬虫.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-01 6 | """ 7 | import asyncio 8 | import aiohttp 9 | import time 10 | 11 | 12 | async def async_craw(url): 13 | print("craw url: ", url) 14 | async with aiohttp.ClientSession() as session: 15 | async with session.get(url) as resp: 16 | result = await resp.text() 17 | print(f"craw url: {url}, {len(result)}") 18 | 19 | 20 | if __name__ == '__main__': 21 | urls = [ 22 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1) 23 | ] 24 | 25 | loop = asyncio.get_event_loop() # 获取超级循环 26 | tasks = [loop.create_task(async_craw(url)) for url in urls] # 建立任务 27 | start = time.time() 28 | loop.run_until_complete(asyncio.wait(tasks)) # 开始执行 29 | end = time.time() 30 | print("use time seconds: ", end - start) -------------------------------------------------------------------------------- /python并发编程/009-使用信号量控制协程数进行爬虫.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : 009-使用信号量控制协程数进行爬虫.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-02-01 6 | """ 7 | import asyncio 8 | import aiohttp 9 | import time 10 | 11 | 12 | async def async_craw(url): 13 | async with semaphore: # 加了这个 14 | print("craw url: ", url) 15 | async with aiohttp.ClientSession() as session: 16 | async with session.get(url) as resp: 17 | result = await resp.text() 18 | await asyncio.sleep(5) 19 | print(f"craw url: {url}, {len(result)}") 20 | 21 | 22 | if __name__ == '__main__': 23 | urls = [ 24 | "https://www.cnblogs.com/sitehome/p/{}".format(page) for page in range(1, 50 + 1) 25 | ] 26 | semaphore = asyncio.Semaphore(10) # 控制并发量 27 | 28 | loop = asyncio.get_event_loop() # 获取超级循环 29 | tasks = [loop.create_task(async_craw(url)) for url in urls] # 建立任务 30 | start = time.time() 31 | loop.run_until_complete(asyncio.wait(tasks)) # 开始执行 32 | end = time.time() 33 | print("use time seconds: ", end - start) 34 | -------------------------------------------------------------------------------- /streamlit的使用/鸢尾花数据的分类app/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | @file : app.py 3 | @author : xiaolu 4 | @email : luxiaonlp@163.com 5 | @time : 2021-06-09 6 | """ 7 | import streamlit as st 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | from sklearn import datasets 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.decomposition import PCA 13 | from sklearn.svm import SVC 14 | from sklearn.neighbors import KNeighborsClassifier 15 | from sklearn.ensemble import RandomForestClassifier 16 | from sklearn.metrics import
accuracy_score 17 | 18 | 19 | def get_dataset(name): 20 | # 加载数据集 21 | if name == 'Iris': 22 | data = datasets.load_iris() 23 | elif name == 'Wine': 24 | data = datasets.load_wine() 25 | else: 26 | data = datasets.load_breast_cancer() 27 | X = data.data 28 | y = data.target 29 | return X, y 30 | 31 | 32 | def add_parameter_ui(clf_name): 33 | # 针对每个分类器 可以调节的超参数 34 | params = dict() 35 | if clf_name == 'SVM': 36 | C = st.sidebar.slider('C', 0.01, 10.0) # 滑动条 37 | params['C'] = C 38 | elif clf_name == 'KNN': 39 | K = st.sidebar.slider('K', 1, 15) # 滑动条 40 | params['K'] = K 41 | else: 42 | max_depth = st.sidebar.slider('max_depth', 2, 15) # 滑动条 43 | params['max_depth'] = max_depth 44 | n_estimators = st.sidebar.slider('n_estimators', 1, 100) # 滑动条 45 | params['n_estimators'] = n_estimators 46 | return params 47 | 48 | 49 | def get_classifier(clf_name, params): 50 | # 实例化分类器 51 | clf = None 52 | if clf_name == 'SVM': 53 | clf = SVC(C=params['C']) 54 | elif clf_name == 'KNN': 55 | clf = KNeighborsClassifier(n_neighbors=params['K']) 56 | else: 57 | clf = RandomForestClassifier(n_estimators=params['n_estimators'], 58 | max_depth=params['max_depth'], random_state=1234) 59 | return clf 60 | 61 | 62 | def plot_result(): 63 | pca = PCA(2) 64 | X_projected = pca.fit_transform(X) 65 | x1 = X_projected[:, 0] 66 | x2 = X_projected[:, 1] 67 | fig = plt.figure() 68 | plt.scatter(x1, x2, c=y, alpha=0.8, cmap='viridis') 69 | 70 | plt.xlabel('feature_1') 71 | plt.ylabel('feature_2') 72 | plt.colorbar() 73 | st.pyplot(fig) 74 | 75 | 76 | if __name__ == '__main__': 77 | # 启动该项目,命令行: streamlit run app.py 78 | st.title('鸢尾花数据集的分类') 79 | st.write(''' 80 | # 支持选择不同的分类器(SVM/Random Forest/KNN) 81 | 哪一个分类器更好呢?''') # 支持markdown 82 | 83 | # 1. 可以选择不同的数据集 是一个下拉选择框 84 | dataset_name = st.sidebar.selectbox( 85 | '数据集的选择', 86 | ('Iris', 'Breast Cancer', 'Wine') 87 | ) 88 | 89 | st.write('## {} 数据集'.format(dataset_name)) # 选择好数据集 这里显示 90 | 91 | # 2. 
可以选择不同的分类器, 是一个下拉选择框 92 | classifier_name = st.sidebar.selectbox( 93 | '分类器的选择', 94 | ('KNN', 'SVM', 'Random Forest') 95 | ) 96 | 97 | X, y = get_dataset(dataset_name) 98 | st.write('数据集的形状:', X.shape) 99 | st.write('数据集的类别数:', len(np.unique(y))) 100 | 101 | params = add_parameter_ui(classifier_name) 102 | 103 | clf = get_classifier(classifier_name, params) 104 | 105 | # 模型训练 106 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234) 107 | 108 | clf.fit(X_train, y_train) 109 | y_pred = clf.predict(X_test) 110 | 111 | acc = accuracy_score(y_test, y_pred) # 准确率 112 | 113 | st.write('选择的分类器为: ', classifier_name) 114 | st.write('准确率: ', acc) 115 | 116 | # 画图 117 | plot_result() 118 | -------------------------------------------------------------------------------- /textrank4zh/001-关键词提取.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2020/11/4 16:37 4 | @Auth : xiaolu 5 | @File :001-关键词提取.py 6 | @IDE :PyCharm 7 | @Email:luxiaonlp@163.com 8 | """ 9 | from textrank4zh import TextRank4Keyword 10 | 11 | 12 | if __name__ == '__main__': 13 | # 加载文本 14 | data = [] 15 | with open('./data/text.txt', 'r', encoding='utf8') as f: 16 | for line in f.readlines(): 17 | line = line.strip() 18 | data.append(line) 19 | 20 | # 关键词提取 21 | tr4w = TextRank4Keyword() 22 | 23 | data = data[:1] 24 | for text in data: 25 | tr4w.analyze(text=text, lower=True, window=2) 26 | for item in tr4w.get_keywords(20, word_min_len=1): 27 | print('{}:{:6f}'.format(item.word, item.weight)) 28 | 29 | # 关键短语抽取 30 | for text in data: 31 | tr4w.analyze(text=text, lower=True, window=2) 32 | for phrase in tr4w.get_keyphrases(20, min_occur_num=1): 33 | print(phrase) 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /textrank4zh/002-摘要抽取.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2020/11/4 16:47 4 | @Auth : xiaolu 5 | @File :002-摘要抽取.py 6 | @IDE :PyCharm 7 | @Email:luxiaonlp@163.com 8 | """ 9 | from textrank4zh import TextRank4Sentence 10 | 11 | if __name__ == '__main__': 12 | # 加载文本 13 | data = [] 14 | with open('./data/text.txt', 'r', encoding='utf8') as f: 15 | for line in f.readlines(): 16 | line = line.strip() 17 | data.append(line) 18 | 19 | # 摘要抽取 20 | tr4s = TextRank4Sentence() 21 | 22 | data = data[:1] 23 | for text in data: 24 | tr4s.analyze(text=text, lower=True, source='all_filters') 25 | for item in tr4s.get_key_sentences(num=3): 26 | print(item.index, item.weight, item.sentence) 27 | -------------------------------------------------------------------------------- /textrank4zh/readme.txt: -------------------------------------------------------------------------------- 1 | 安装 pip install textrank4zh -i https://pypi.douban.com/simple/ --------------------------------------------------------------------------------
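A minimal, self-contained smoke test for the install command above — a sketch only, reusing the exact TextRank4Keyword / TextRank4Sentence calls shown in 001-关键词提取.py and 002-摘要抽取.py, with an inline sample string standing in for ./data/text.txt (the sample text itself is an assumed placeholder, not part of the repo):

from textrank4zh import TextRank4Keyword, TextRank4Sentence

text = '这里放一段待分析的中文文本, 内容可替换为任意语料。'  # 假设的占位文本

tr4w = TextRank4Keyword()
tr4w.analyze(text=text, lower=True, window=2)  # 与001-关键词提取.py相同的调用方式
print([item.word for item in tr4w.get_keywords(5, word_min_len=1)])

tr4s = TextRank4Sentence()
tr4s.analyze(text=text, lower=True, source='all_filters')  # 与002-摘要抽取.py相同的调用方式
print([item.sentence for item in tr4s.get_key_sentences(num=2)])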