├── app
│   ├── ML
│   │   ├── MLModel.py
│   │   ├── RecommandModel.py
│   │   ├── SimpleKNN.py
│   │   ├── data
│   │   │   ├── Iris.csv
│   │   │   ├── movies.csv
│   │   │   └── ratings.csv
│   │   ├── doc
│   │   │   ├── AnalyseRecommandModel.md
│   │   │   └── index.md
│   │   └── index.md
│   ├── __init__.py
│   ├── api
│   │   └── __init__.py
│   ├── elasticsearch
│   │   ├── bulk.py
│   │   ├── index.py
│   │   ├── readJson.py
│   │   └── test.py
│   ├── forms
│   │   └── book.py
│   ├── libs
│   │   ├── __init__.py
│   │   ├── helper.py
│   │   └── httpRequest.py
│   ├── models
│   │   └── book.py
│   ├── secure.py
│   ├── setting.py
│   ├── spider
│   │   ├── __init__.py
│   │   └── yushu_book.py
│   ├── testPy
│   │   ├── DecisionTreeIris.pmml
│   │   ├── groubBy.py
│   │   └── index.py
│   └── web
│       ├── __init__.py
│       ├── book.py
│       └── user.py
├── fisher.py
└── readme.md

/app/ML/MLModel.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Until-You-Possible/python-flask-web/09de58d305f9efbc4babde5ad0ed853398433df9/app/ML/MLModel.py
--------------------------------------------------------------------------------
/app/ML/RecommandModel.py:
--------------------------------------------------------------------------------
# code
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
# Build the user-item matrix with a scipy CSR sparse matrix
from scipy.sparse import csr_matrix

import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

ratings = pd.read_csv("./data/ratings.csv")
ratings.head()

movies = pd.read_csv("./data/movies.csv")
movies.head()

n_ratings = len(ratings)
n_movies = len(ratings['movieId'].unique())
n_users = len(ratings['userId'].unique())

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {round(n_ratings / n_users, 2)}")
print(f"Average ratings per movie: {round(n_ratings / n_movies, 2)}")

user_freq = ratings[['userId', 'movieId']].groupby('userId').count().reset_index()
user_freq.columns = ['userId', 'n_ratings']
user_freq.head()

# Find the lowest- and highest-rated movies:
mean_rating = ratings.groupby('movieId')[['rating']].mean()
# Lowest-rated movie
lowest_rated = mean_rating['rating'].idxmin()
movies.loc[movies['movieId'] == lowest_rated]
# Highest-rated movie
highest_rated = mean_rating['rating'].idxmax()
movies.loc[movies['movieId'] == highest_rated]
# Show the users who rated the highest-rated movie
ratings[ratings['movieId'] == highest_rated]
# Show the users who rated the lowest-rated movie
ratings[ratings['movieId'] == lowest_rated]

# The movies above have very few ratings, so their raw means are unreliable;
# a Bayesian average (shrinking each mean toward the global mean) is more robust.
movie_stats = ratings.groupby('movieId')[['rating']].agg(['count', 'mean'])
movie_stats.columns = movie_stats.columns.droplevel()


def create_matrix(df):
    N = len(df['userId'].unique())
    M = len(df['movieId'].unique())

    # Map IDs to indices
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(M))))

    # Map indices to IDs
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movieId"])))

    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper


X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

from sklearn.neighbors import NearestNeighbors

"""
Find similar movies using KNN
"""


def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    neighbour_ids = []

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    k += 1  # the query movie is its own nearest neighbour, so fetch one extra
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    movie_vec = movie_vec.reshape(1, -1)
    # Note: with return_distance=True, kneighbors returns a (distances, indices)
    # tuple, which the indexing below does not handle; keep show_distance=False.
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
    for i in range(0, k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)  # drop the query movie itself
    return neighbour_ids


movie_titles = dict(zip(movies['movieId'], movies['title']))

movie_id = 3

similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]

print(f"Since you watched {movie_title}")
for i in similar_ids:
    print(movie_titles[i])
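The script computes per-movie count and mean but never finishes the Bayesian average it announces. A minimal sketch of that missing step, assuming the movie_stats frame defined above (the prior strength C and its heuristic are illustrative choices, not values from the original):

# Hypothetical continuation: Bayesian (damped) average per movie.
# C acts as a pseudo-count pulling sparsely rated movies toward the global mean m.
C = movie_stats['count'].mean()   # prior strength (one common heuristic)
m = ratings['rating'].mean()      # global mean rating

movie_stats['bayesian_avg'] = (
    (C * m + movie_stats['count'] * movie_stats['mean']) / (C + movie_stats['count'])
)

# Movies ranked by the damped average are far less noisy than raw means.
print(movie_stats.sort_values('bayesian_avg', ascending=False).head())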
--------------------------------------------------------------------------------
/app/ML/SimpleKNN.py:
--------------------------------------------------------------------------------
# k-nearest neighbours
import pandas
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler  # needed for the standardization step below


def k_near():
    """
    2 samples with 3 features each:
    a(a1, a2, a3), b(b1, b2, b3)
    Euclidean distance:
    p = sqrt((a1 - b1)^2 + (a2 - b2)^2 + (a3 - b3)^2)
    """
    # 1. Raw data
    # Read the data
    train_data = pandas.read_csv("k_near/train.csv")
    # print(train_data.head(10))

    # 2. Data processing
    # Filter the data
    train_data = train_data.query("x > 1.0 & x < 1.25 & y > 2.5 & y < 2.75")

    # Convert timestamps
    time_value = pandas.to_datetime(train_data["time"], unit="s")
    # Convert to a DatetimeIndex
    time_value = pandas.DatetimeIndex(time_value)
    # print(time_value)

    # Build features
    data = train_data.copy()
    data["day"] = time_value.day
    data["hour"] = time_value.hour
    data["weekday"] = time_value.weekday
    # print(train_data.head(10))

    # Drop the raw time column now that features are built; axis=1 drops columns
    data = data.drop(["time"], axis=1)

    # Drop rows whose place_id occurs too rarely
    place_count = data.groupby("place_id").count()
    # print(place_count)
    # Keep place IDs occurring more than 5 times, restoring place_id as a column
    tf = place_count[place_count.x > 5].reset_index()
    # print(tf)
    data = data[data["place_id"].isin(tf.place_id)]

    # Split into feature values and target values
    y = data["place_id"]
    x = data.drop(["place_id", "row_id"], axis=1)

    # Train/test split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # 3. Feature engineering (standardization)
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    # 4. Algorithm
    """
    Pros:
        simple, easy to understand and implement, no parameters to estimate, no training
    Cons:
        a lazy algorithm -- classifying test samples is compute- and memory-intensive;
        K must be specified, and a poor choice of K hurts accuracy
    Issues:
        small K: sensitive to outliers
        large K: sensitive to the class distribution among the K neighbours
        performance: every prediction loops over all the data
    """
    # K is n_neighbors: how many neighbouring points vote on the class
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(x_train, y_train)
    y_predict = knn.predict(x_test)
    print("predictions:", y_predict)

    # 5. Evaluation
    score = knn.score(x_test, y_test)
    print("accuracy:", score)
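The docstring warns that a poorly chosen K hurts accuracy, yet n_neighbors is hard-coded to 5. A minimal sketch of choosing K by cross-validation instead, assuming the x_train/y_train split produced inside k_near() above:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Search a small grid of K values with 3-fold cross-validation.
param_grid = {"n_neighbors": [3, 5, 7, 9, 11]}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=3)
search.fit(x_train, y_train)

print("best K:", search.best_params_["n_neighbors"])
print("cross-validated accuracy:", search.best_score_)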
"73",6.3,2.5,4.9,1.5,"versicolor" 75 | "74",6.1,2.8,4.7,1.2,"versicolor" 76 | "75",6.4,2.9,4.3,1.3,"versicolor" 77 | "76",6.6,3,4.4,1.4,"versicolor" 78 | "77",6.8,2.8,4.8,1.4,"versicolor" 79 | "78",6.7,3,5,1.7,"versicolor" 80 | "79",6,2.9,4.5,1.5,"versicolor" 81 | "80",5.7,2.6,3.5,1,"versicolor" 82 | "81",5.5,2.4,3.8,1.1,"versicolor" 83 | "82",5.5,2.4,3.7,1,"versicolor" 84 | "83",5.8,2.7,3.9,1.2,"versicolor" 85 | "84",6,2.7,5.1,1.6,"versicolor" 86 | "85",5.4,3,4.5,1.5,"versicolor" 87 | "86",6,3.4,4.5,1.6,"versicolor" 88 | "87",6.7,3.1,4.7,1.5,"versicolor" 89 | "88",6.3,2.3,4.4,1.3,"versicolor" 90 | "89",5.6,3,4.1,1.3,"versicolor" 91 | "90",5.5,2.5,4,1.3,"versicolor" 92 | "91",5.5,2.6,4.4,1.2,"versicolor" 93 | "92",6.1,3,4.6,1.4,"versicolor" 94 | "93",5.8,2.6,4,1.2,"versicolor" 95 | "94",5,2.3,3.3,1,"versicolor" 96 | "95",5.6,2.7,4.2,1.3,"versicolor" 97 | "96",5.7,3,4.2,1.2,"versicolor" 98 | "97",5.7,2.9,4.2,1.3,"versicolor" 99 | "98",6.2,2.9,4.3,1.3,"versicolor" 100 | "99",5.1,2.5,3,1.1,"versicolor" 101 | "100",5.7,2.8,4.1,1.3,"versicolor" 102 | "101",6.3,3.3,6,2.5,"virginica" 103 | "102",5.8,2.7,5.1,1.9,"virginica" 104 | "103",7.1,3,5.9,2.1,"virginica" 105 | "104",6.3,2.9,5.6,1.8,"virginica" 106 | "105",6.5,3,5.8,2.2,"virginica" 107 | "106",7.6,3,6.6,2.1,"virginica" 108 | "107",4.9,2.5,4.5,1.7,"virginica" 109 | "108",7.3,2.9,6.3,1.8,"virginica" 110 | "109",6.7,2.5,5.8,1.8,"virginica" 111 | "110",7.2,3.6,6.1,2.5,"virginica" 112 | "111",6.5,3.2,5.1,2,"virginica" 113 | "112",6.4,2.7,5.3,1.9,"virginica" 114 | "113",6.8,3,5.5,2.1,"virginica" 115 | "114",5.7,2.5,5,2,"virginica" 116 | "115",5.8,2.8,5.1,2.4,"virginica" 117 | "116",6.4,3.2,5.3,2.3,"virginica" 118 | "117",6.5,3,5.5,1.8,"virginica" 119 | "118",7.7,3.8,6.7,2.2,"virginica" 120 | "119",7.7,2.6,6.9,2.3,"virginica" 121 | "120",6,2.2,5,1.5,"virginica" 122 | "121",6.9,3.2,5.7,2.3,"virginica" 123 | "122",5.6,2.8,4.9,2,"virginica" 124 | "123",7.7,2.8,6.7,2,"virginica" 125 | "124",6.3,2.7,4.9,1.8,"virginica" 126 | "125",6.7,3.3,5.7,2.1,"virginica" 127 | "126",7.2,3.2,6,1.8,"virginica" 128 | "127",6.2,2.8,4.8,1.8,"virginica" 129 | "128",6.1,3,4.9,1.8,"virginica" 130 | "129",6.4,2.8,5.6,2.1,"virginica" 131 | "130",7.2,3,5.8,1.6,"virginica" 132 | "131",7.4,2.8,6.1,1.9,"virginica" 133 | "132",7.9,3.8,6.4,2,"virginica" 134 | "133",6.4,2.8,5.6,2.2,"virginica" 135 | "134",6.3,2.8,5.1,1.5,"virginica" 136 | "135",6.1,2.6,5.6,1.4,"virginica" 137 | "136",7.7,3,6.1,2.3,"virginica" 138 | "137",6.3,3.4,5.6,2.4,"virginica" 139 | "138",6.4,3.1,5.5,1.8,"virginica" 140 | "139",6,3,4.8,1.8,"virginica" 141 | "140",6.9,3.1,5.4,2.1,"virginica" 142 | "141",6.7,3.1,5.6,2.4,"virginica" 143 | "142",6.9,3.1,5.1,2.3,"virginica" 144 | "143",5.8,2.7,5.1,1.9,"virginica" 145 | "144",6.8,3.2,5.9,2.3,"virginica" 146 | "145",6.7,3.3,5.7,2.5,"virginica" 147 | "146",6.7,3,5.2,2.3,"virginica" 148 | "147",6.3,2.5,5,1.9,"virginica" 149 | "148",6.5,3,5.2,2,"virginica" 150 | "149",6.2,3.4,5.4,2.3,"virginica" 151 | "150",5.9,3,5.1,1.8,"virginica" -------------------------------------------------------------------------------- /app/ML/doc/AnalyseRecommandModel.md: -------------------------------------------------------------------------------- 1 | ***对RecommendModel模型的分析*** 2 | 3 | **import的部分** 4 | 5 | 1: numpy就不赘述了(做矩阵计算用的) 6 | 7 | 2: pandas 8 | 参考文档 https://www.pypandas.cn/docs/getting_started/basics.html#head-%E4%B8%8E-tail 9 | 3: sklearn,基本的机器学习的框架 10 | 4:seaborn, 这是一个基于matplotlib进行高级封装的可视化库,相比之下,绘制图表更为集成化、绘图风格具有更高的定制性 11 | 相关参考文档: https://pypi.org/project/seaborn/ 12 | 
--------------------------------------------------------------------------------
/app/ML/doc/AnalyseRecommandModel.md:
--------------------------------------------------------------------------------
***Analysis of the RecommandModel***

**Imports**

1: numpy needs no introduction (it does the matrix computation)

2: pandas
   Reference: https://www.pypandas.cn/docs/getting_started/basics.html#head-%E4%B8%8E-tail
3: sklearn, the basic machine-learning framework
4: seaborn, a visualization library that wraps matplotlib at a higher level; by comparison, its charts are more integrated and its plotting styles are more customizable.
   References: https://pypi.org/project/seaborn/
   https://zhuanlan.zhihu.com/p/342945532
5: scipy: https://www.biaodianfu.com/scipy-sparse.html
   covers building sparse matrices; see the documentation for details
6: the warnings module: https://blog.csdn.net/low5252/article/details/109334695


***Data***

First, the data: we have two usable datasets, ratings.csv and movies.csv.
ratings contains the fields userId, movieId, rating and timestamp;
movies contains three fields: movieId, title and genres.


***Code***

1: read the ratings file and take the first 5 rows (head; tail gives the last 5)
2: same for movies

n_ratings is the length of ratings;
n_movies and n_users are computed the same way
(unique deduplicates the values, so len of the result gives the count)

1: user_freq
Here the ratings data is split along the userId dimension into as many tables as there are distinct userId values, then each group's rows are counted (count);
reset_index keeps the groupby key as an ordinary column, which makes the result easier to read;
columns renames the table's columns;
then take the first five rows.

1: mean_rating
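A tiny self-contained illustration of the user_freq chain the note describes, on synthetic data rather than the real ratings.csv:

import pandas as pd

# Stand-in for ratings.csv, just to show groupby -> count -> reset_index.
ratings = pd.DataFrame({"userId": [1, 1, 2, 3, 3, 3],
                        "movieId": [10, 20, 10, 10, 20, 30]})

user_freq = ratings[["userId", "movieId"]].groupby("userId").count().reset_index()
user_freq.columns = ["userId", "n_ratings"]
print(user_freq)  # user 1 rated 2 movies, user 2 rated 1, user 3 rated 3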
--------------------------------------------------------------------------------
/app/ML/doc/index.md:
--------------------------------------------------------------------------------
### Common concepts
--------------------------------------------------------------------------------
/app/ML/index.md:
--------------------------------------------------------------------------------
### Testing some models

1: some references
https://www.cnblogs.com/ll409546297/p/11215141.html
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : __init__.py
# @Software: PyCharm

from flask import Flask
from app.web.book import web
from app.models.book import db


def create_app():
    app = Flask(__name__)
    # Load the configuration files
    app.config.from_object('app.secure')
    app.config.from_object('app.setting')
    register_blueprint(app)
    db.init_app(app)
    db.create_all(app=app)  # Flask-SQLAlchemy 2.x style: create tables for this app
    return app


def register_blueprint(app):
    app.register_blueprint(web)
--------------------------------------------------------------------------------
/app/api/__init__.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 2:10 PM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : __init__.py
# @Software: PyCharm
--------------------------------------------------------------------------------
/app/elasticsearch/bulk.py:
--------------------------------------------------------------------------------
import json
import time

from elasticsearch import Elasticsearch, helpers

es_url = "http://127.0.0.1:9200"
es = Elasticsearch(es_url)
print(es.info())  # call the method; a bare es.info only prints the bound method

configurations = {
    "index_name": "index_name",
    "index_type": "index_type",
    "request_body": {}
}

source_path = "/Users/wanggang/Documents/kba/kba.json"


def create_index():
    es.indices.create(index=configurations.get("index_name"), body=configurations.get("request_body"))
    print("create a new index")


# create_index()


def check_json_count():
    count = 0
    start_time = time.time()
    with open(source_path, "r", encoding="UTF-8") as fp:
        json_data = json.load(fp, strict=False)
        for item in json_data:
            count += 1
    end_time = time.time()
    t = end_time - start_time
    des = "Read {} records in {} seconds".format(count, t)
    return des


print(check_json_count())


def read_json_file():
    with open(source_path, "r", encoding="UTF-8") as fp:
        json_data = json.load(fp, strict=False)
        actions = []
        count = 0
        for item in json_data:
            count += 1
            action = {
                "_index": configurations.get("index_name"),
                "_type": configurations.get("index_type"),  # was "type"; bulk action metadata keys start with "_"
                "_source": item
            }
            actions.append(action)
            if len(actions) == 1000:
                helpers.bulk(es, actions)
                actions = []
        helpers.bulk(es, actions)  # flush the final partial batch


# read_json_file()


def check_json_count_block():
    count = 0
    block_size = 1024 * 8
    with open(source_path, "r", encoding="UTF-8") as fp:
        while True:
            chunk = fp.read(block_size)
            if not chunk:
                break
            count += chunk.count("SAP")
    return count


print("test number", check_json_count_block())
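Accumulating 1000-item lists by hand works, but elasticsearch-py can do the batching itself. A sketch using helpers.streaming_bulk with a generator, assuming the same es client, configurations dict and source_path as above:

def generate_actions():
    # Yield one action per record so the whole file never sits in an actions list.
    with open(source_path, "r", encoding="UTF-8") as fp:
        for item in json.load(fp, strict=False):
            yield {"_index": configurations.get("index_name"), "_source": item}


def bulk_with_streaming():
    ok_count = 0
    # streaming_bulk batches the actions (chunk_size per request) and
    # yields an (ok, result) pair per document.
    for ok, _ in helpers.streaming_bulk(es, generate_actions(), chunk_size=1000):
        ok_count += ok
    print("indexed", ok_count, "documents")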
--------------------------------------------------------------------------------
/app/elasticsearch/index.py:
--------------------------------------------------------------------------------
import json
import time

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch('http://127.0.0.1:9200')
print(es.info())
count = 0
start_time = time.time()
actions = []
source_path = "/Users/wanggang/Documents/kba/kba.json"
index_name = "kba"
index_type = "_doc"
with open(source_path, 'r', encoding='utf8') as fp:
    json_data = json.load(fp, strict=False)
    for item in json_data:
        count += 1
        action = {
            "_index": index_name,
            "_type": index_type,
            "_source": item  # was "source"; the document body belongs under "_source"
        }
        actions.append(action)
        if len(actions) == 1000:
            helpers.bulk(es, actions)
            actions = []
    helpers.bulk(es, actions)
print("total records:", count)
end_time = time.time()
t = end_time - start_time
print("imported", count, "records in", t, "seconds")
--------------------------------------------------------------------------------
/app/elasticsearch/readJson.py:
--------------------------------------------------------------------------------
import json

from elasticsearch import Elasticsearch

es = Elasticsearch(['http://127.0.0.1'], port=9200)
# Connected to es successfully
print(es.info())

mapping = {
    'properties': {
        'title': {
            'type': 'text',
            'analyzer': 'standard'
        }
    }
}

es.indices.create(index='news', ignore=400)
es.indices.put_mapping(index='news', doc_type='politics', include_type_name=True, body=mapping)

with open('/Users/wanggang/Desktop/index.json', 'r', encoding='utf8') as fp:
    json_data = json.load(fp)
    for item in json_data:
        print("item", item)
        es.index(index="news", doc_type='politics', document=item)
    print('JSON data from the file:', json_data)
    print('type of the parsed data:', type(json_data))
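These scripts only write to Elasticsearch. A quick way to confirm the documents landed, assuming the same 'news' index and a 7.x client as above (the query text is illustrative):

# Hypothetical smoke test: full-text match on the mapped 'title' field.
result = es.search(index='news', body={
    'query': {'match': {'title': 'election'}},
    'size': 5
})
for hit in result['hits']['hits']:
    print(hit['_score'], hit['_source'])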
--------------------------------------------------------------------------------
/app/elasticsearch/test.py:
--------------------------------------------------------------------------------
# !/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import uuid

from elasticsearch import Elasticsearch, helpers

# create a new instance of the Elasticsearch client class
elastic = Elasticsearch()
# ...or uncomment to use this instead:
# elastic = Elasticsearch("localhost")

'''
a simple function that gets the working path of
the Python script and returns it
'''


def script_path():
    path = os.path.dirname(os.path.realpath(__file__))
    if os.name == 'posix':  # posix is for macOS or Linux
        path = path + "/"
    else:
        path = path + chr(92)  # backslash is for Windows
    return path


'''
this function opens a file and returns its
contents as a list of strings split by linebreaks
'''


def get_data_from_file(file_name, path=script_path()):  # was misleadingly named `self`
    file = open(path + str(file_name), encoding="utf8", errors='ignore')
    data = [line.strip() for line in file]
    file.close()
    return data


'''
generator to push bulk data from a JSON
file into an Elasticsearch index
'''


def bulk_json_data(json_file, _index, doc_type):
    json_list = get_data_from_file(json_file)
    for doc in json_list:
        # use a `yield` generator so that the data
        # isn't loaded into memory
        if '{"index"' not in doc:
            yield {
                "_index": _index,
                "_type": doc_type,
                "_id": uuid.uuid4(),
                "_source": doc
            }


try:
    # make the bulk call, and get a response
    response = helpers.bulk(elastic, bulk_json_data("people.json", "employees", "people"))
    print("\nbulk_json_data() RESPONSE:", response)
except Exception as e:
    print("\nERROR:", e)

# iterator for a single document
actions = [
    {
        "_id": uuid.uuid4(),  # random UUID for _id
        "doc_type": "person",  # document _type
        "doc": {  # the body of the document
            "name": "George Peterson",
            "sex": "male",
            "age": 34,
            "years": 10
        }
    }
]

# iterator for multiple docs
actions = [
    {
        "_id": uuid.uuid4(),  # random UUID for _id
        "doc_type": "person",  # document _type
        "doc": {  # the body of the document
            "name": "George Peterson",
            "sex": "male",
            "age": 34 + doc,
            "years": 10 + doc
        }
    }
    for doc in range(100)  # use 'for' loop to insert 100 documents
]

try:
    # make the bulk call using 'actions' and get a response
    response = helpers.bulk(elastic, actions, index='employees', doc_type='people')
    print("\nactions RESPONSE:", response)
except Exception as e:
    print("\nERROR:", e)
--------------------------------------------------------------------------------
/app/forms/book.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 12:47 PM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : book.py
# @Software: PyCharm

from wtforms import Form, StringField, IntegerField
from wtforms.validators import Length, NumberRange, DataRequired, Regexp


class SearchForm(Form):
    q = StringField(validators=[DataRequired(), Length(min=1, max=30)])
    page = IntegerField(validators=[NumberRange(min=1, max=99)], default=1)
--------------------------------------------------------------------------------
/app/libs/__init__.py:
--------------------------------------------------------------------------------
# @Time : 9/23/2021 10:38 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : __init__.py
# @Software: PyCharm
--------------------------------------------------------------------------------
/app/libs/helper.py:
--------------------------------------------------------------------------------
# Decide whether a query word is a keyword or an ISBN
def is_isbn_or_key(word):
    # Default to keyword search
    isbn_or_key = "key"
    # A plain 13-digit string is treated as an ISBN-13
    if len(word) == 13 and word.isdigit():
        isbn_or_key = "isbn"
    # A hyphenated string with 10 digits once stripped is treated as an ISBN-10
    short_word = word.replace("-", "")
    if "-" in word and len(short_word) == 10 and short_word.isdigit():
        isbn_or_key = "isbn"
    return isbn_or_key
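A quick check of the two ISBN branches (the sample values are illustrative):

print(is_isbn_or_key("9787501524044"))  # 13 digits -> 'isbn'
print(is_isbn_or_key("7-5015-2404-6"))  # hyphenated, 10 digits stripped -> 'isbn'
print(is_isbn_or_key("flask"))          # anything else -> 'key'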
--------------------------------------------------------------------------------
/app/libs/httpRequest.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : httpRequest.py
# @Software: PyCharm

import requests


class HttpRequest:
    @staticmethod
    def get(url, return_json=True):
        # restful
        # json
        r = requests.get(url)
        if r.status_code != 200:
            return {} if return_json else ''
        return r.json() if return_json else r.text
--------------------------------------------------------------------------------
/app/models/book.py:
--------------------------------------------------------------------------------
# @Time : 9/23/2021 10:39 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : book.py
# @Software: PyCharm

# sqlalchemy

from sqlalchemy import Column, Integer, String
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()


class Book(db.Model):
    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String(50), nullable=False)
    author = Column(String(30), nullable=True, default='未名')
    binding = Column(String(20))
    publisher = Column(String(50))
    price = Column(String(20))
    pages = Column(Integer)
    pubdate = Column(String(20))
    isbn = Column(String(15), nullable=False, unique=True)
    summary = Column(String(1000))
    image = Column(String(50))
--------------------------------------------------------------------------------
/app/secure.py:
--------------------------------------------------------------------------------
# @Time : 9/20/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : secure.py
# @Software: PyCharm

DEBUG = False
SQLALCHEMY_DATABASE_URI = 'mysql+cymysql://root:dyh18215153215@localhost:3306/fisher'
--------------------------------------------------------------------------------
/app/setting.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 2:11 PM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : setting.py
# @Software: PyCharm

PER_PAGE = 15
--------------------------------------------------------------------------------
/app/spider/__init__.py:
--------------------------------------------------------------------------------
# @Time : 9/23/2021 10:26 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : __init__.py
# @Software: PyCharm
--------------------------------------------------------------------------------
/app/spider/yushu_book.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : yushu_book.py
# @Software: PyCharm


from app.libs.httpRequest import HttpRequest
from flask import current_app


class YushuBook:
    isbn_url = "http://t.talelin.com/v2/book/isbn/{}"
    keyword_url = "http://t.talelin.com/v2/book/search?q={}&count={}&start={}"

    @classmethod
    def search_by_isbn(cls, isbn):
        url = YushuBook.isbn_url.format(isbn)
        # dict
        result = HttpRequest.get(url)
        return result

    @classmethod
    def search_by_keyword(cls, keyword, page=1):
        url = YushuBook.keyword_url.format(keyword, current_app.config['PER_PAGE'], cls.calculate_start(page))
        # dict
        result = HttpRequest.get(url)
        return result

    @staticmethod
    def calculate_start(page):
        return (page - 1) * current_app.config['PER_PAGE']
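search_by_keyword reads current_app.config, so it only works inside a Flask application context. A sketch of calling it outside a request, assuming the create_app factory from app/__init__.py:

from app import create_app
from app.spider.yushu_book import YushuBook

app = create_app()
with app.app_context():  # makes current_app (and its PER_PAGE setting) available
    result = YushuBook.search_by_keyword("flask", page=1)
    print(result)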
--------------------------------------------------------------------------------
/app/testPy/DecisionTreeIris.pmml:
--------------------------------------------------------------------------------
(XML markup lost in extraction; only fragments survive. The file is a roughly
61-line PMML document exported on 2022-09-06T15:00:16Z for
PMMLPipeline(steps=[('classifier', DecisionTreeClassifier())]) -- the decision
tree fitted on Iris.csv by /app/testPy/index.py below.)
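To consume the exported model from Python, one option is the pypmml package (an assumption -- any PMML scoring engine would do; the feature names follow Iris.csv's headers):

from pypmml import Model  # assumes `pip install pypmml`

model = Model.fromFile("DecisionTreeIris.pmml")
# Score one classic setosa-like sample; the keys mirror the CSV column names.
print(model.predict({
    "Sepal.Length": 5.1, "Sepal.Width": 3.5,
    "Petal.Length": 1.4, "Petal.Width": 0.2,
}))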
--------------------------------------------------------------------------------
/app/testPy/groubBy.py:
--------------------------------------------------------------------------------
# @Time:2022/9/21 09:54
# @Author:Ray
# @File:groubBy.py

import pandas as pd
import numpy as np

# First build a test table

df = pd.DataFrame({'key1': list('aabba'),
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})

# print("df", df)

# 1: split the table by a single feature
grouped = df.groupby(["key1"])

for name, group in grouped:
    print(name)
    print(group)

# Printing shows that df is split by the distinct values of key1 into two tables:
# one where key1 == 'a' and one where key1 == 'b'.
# Splitting by any other feature works the same way.


# 2: split by key1, then take the mean of each feature within each group

groupedMean = df.groupby(["key1"]).mean()
# with a single key, groupby("key1") works just as well

print("groupedMean", groupedMean)

# 3: split the table by multiple features

for name, group in df.groupby(["key1", "key2"]):
    print("multiple features", name)
    print(group, "\n")

# A concrete use case:
# two shops, 1 and 2, each selling items a, b and c, each item with its own
# daily sales; to get each shop's monthly sales per item quickly, use groupby.

df2 = pd.DataFrame({'shop_id': list('111111222222'),
                    'item_id': list('abcabcabcabc'),
                    'item_daysales': [int(c) for c in '123456123456']})  # ints, not strings, so sum() adds instead of concatenating

print("df_field_table", df2[["shop_id", "item_id"]].groupby("shop_id"))

# Compute each shop's monthly sales per item
# reset_index keeps the grouping keys (here shop_id and item_id) as columns
month_grounded = df2.groupby(["shop_id", "item_id"]).sum().reset_index()

print("month_grounded", month_grounded)
--------------------------------------------------------------------------------
/app/testPy/index.py:
--------------------------------------------------------------------------------

import pandas
from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml


iris_df = pandas.read_csv("../ML/data/Iris.csv")

iris_X = iris_df[iris_df.columns.difference(["Species"])]
iris_y = iris_df["Species"]

pipeline = PMMLPipeline([
    ("classifier", DecisionTreeClassifier())
])
pipeline.fit(iris_X, iris_y)

sklearn2pmml(pipeline, "DecisionTreeIris.pmml", with_repr=True)
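A quick sanity check before shipping the export, assuming the pipeline above -- PMMLPipeline behaves like an ordinary sklearn pipeline:

from sklearn.metrics import accuracy_score

# Training accuracy only; a decision tree fits Iris almost perfectly,
# so this is a smoke test rather than a real evaluation.
predictions = pipeline.predict(iris_X)
print("training accuracy:", accuracy_score(iris_y, predictions))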
--------------------------------------------------------------------------------
/app/web/__init__.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : __init__.py
# @Software: PyCharm

from flask import Blueprint

web = Blueprint('web', __name__)
--------------------------------------------------------------------------------
/app/web/book.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : book.py
# @Software: PyCharm


from flask import jsonify, request

from app.libs.helper import is_isbn_or_key
from app.spider.yushu_book import YushuBook
from . import web
from app.forms.book import SearchForm


@web.route("/book/search")
def search():
    """
    q: a keyword or an ISBN
    page: page number
    """
    # Validate the incoming parameters
    # q = request.args['q']
    # page = request.args['page']
    form = SearchForm(request.args)
    if form.validate():
        q = form.q.data.strip()
        page = form.page.data
        isbn_or_key = is_isbn_or_key(q)
        if isbn_or_key == 'isbn':
            result = YushuBook.search_by_isbn(q)
        else:
            result = YushuBook.search_by_keyword(q, page)
        return jsonify(result)
        # return result
    else:
        return jsonify(form.errors)
--------------------------------------------------------------------------------
/app/web/user.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : user.py
# @Software: PyCharm
--------------------------------------------------------------------------------
/fisher.py:
--------------------------------------------------------------------------------

# @Time : 9/20/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : fisher.py
# @Software: PyCharm

from app import create_app

app = create_app()

if __name__ == '__main__':
    # production deployment: nginx + uwsgi
    app.run(host='0.0.0.0', debug=app.config['DEBUG'])
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
###### A website built with Python, Flask and MySQL
Basic links used:
Flask: https://flask.net.cn/

Python 3.x, macOS, MySQL database

Python docs: https://docs.python.org/3.6/tutorial/index.html

Sqlalchemy: https://www.sqlalchemy.org/
--------------------------------------------------------------------------------
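To exercise the /book/search endpoint once `python fisher.py` is running (Flask serves on port 5000 by default; the query values are illustrative):

import requests  # local smoke test against the running dev server

print(requests.get("http://127.0.0.1:5000/book/search", params={"q": "9787501524044"}).json())       # ISBN lookup
print(requests.get("http://127.0.0.1:5000/book/search", params={"q": "flask", "page": 1}).json())    # keyword search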