├── app
│   ├── ML
│   │   ├── MLModel.py
│   │   ├── RecommandModel.py
│   │   ├── SimpleKNN.py
│   │   ├── data
│   │   │   ├── Iris.csv
│   │   │   ├── movies.csv
│   │   │   └── ratings.csv
│   │   ├── doc
│   │   │   ├── AnalyseRecommandModel.md
│   │   │   └── index.md
│   │   └── index.md
│   ├── __init__.py
│   ├── api
│   │   └── __init__.py
│   ├── elasticsearch
│   │   ├── bulk.py
│   │   ├── index.py
│   │   ├── readJson.py
│   │   └── test.py
│   ├── forms
│   │   └── book.py
│   ├── libs
│   │   ├── __init__.py
│   │   ├── helper.py
│   │   └── httpRequest.py
│   ├── models
│   │   └── book.py
│   ├── secure.py
│   ├── setting.py
│   ├── spider
│   │   ├── __init__.py
│   │   └── yushu_book.py
│   ├── testPy
│   │   ├── DecisionTreeIris.pmml
│   │   ├── groubBy.py
│   │   └── index.py
│   └── web
│       ├── __init__.py
│       ├── book.py
│       └── user.py
├── fisher.py
└── readme.md

/app/ML/MLModel.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Until-You-Possible/python-flask-web/09de58d305f9efbc4babde5ad0ed853398433df9/app/ML/MLModel.py
--------------------------------------------------------------------------------
/app/ML/RecommandModel.py:
--------------------------------------------------------------------------------
# code
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
# Build the user-item matrix with a scipy CSR sparse matrix
from scipy.sparse import csr_matrix

import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

ratings = pd.read_csv("./data/ratings.csv")
ratings.head()

movies = pd.read_csv("./data/movies.csv")
movies.head()

n_ratings = len(ratings)
n_movies = len(ratings['movieId'].unique())
n_users = len(ratings['userId'].unique())

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {round(n_ratings / n_users, 2)}")
print(f"Average ratings per movie: {round(n_ratings / n_movies, 2)}")

user_freq = ratings[['userId', 'movieId']].groupby('userId').count().reset_index()
user_freq.columns = ['userId', 'n_ratings']
user_freq.head()

# Find the lowest- and highest-rated movies:
mean_rating = ratings.groupby('movieId')[['rating']].mean()
# Lowest-rated movie
lowest_rated = mean_rating['rating'].idxmin()
movies.loc[movies['movieId'] == lowest_rated]
# Highest-rated movie
highest_rated = mean_rating['rating'].idxmax()
movies.loc[movies['movieId'] == highest_rated]
# Show the users who rated the highest-rated movie
ratings[ratings['movieId'] == highest_rated]
# Show the users who rated the lowest-rated movie
ratings[ratings['movieId'] == lowest_rated]

# The movies above have very few ratings, so their raw means are unreliable;
# a Bayesian average (shrinking each mean toward the global mean) is more robust.
movie_stats = ratings.groupby('movieId')[['rating']].agg(['count', 'mean'])
movie_stats.columns = movie_stats.columns.droplevel()


def create_matrix(df):
    N = len(df['userId'].unique())
    M = len(df['movieId'].unique())

    # Map IDs to indices
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(M))))

    # Map indices to IDs
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movieId"])))

    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper


X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

from sklearn.neighbors import NearestNeighbors

"""
Find similar movies using KNN
"""


def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    neighbour_ids = []

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    k += 1  # the query movie is its own nearest neighbour, so fetch one extra
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    movie_vec = movie_vec.reshape(1, -1)
    # Note: with return_distance=True, kneighbors returns a (distances, indices)
    # tuple, which the indexing below does not handle; keep show_distance=False.
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
    for i in range(0, k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)  # drop the query movie itself
    return neighbour_ids


movie_titles = dict(zip(movies['movieId'], movies['title']))

movie_id = 3

similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]

print(f"Since you watched {movie_title}")
for i in similar_ids:
    print(movie_titles[i])
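The script computes per-movie count and mean but never finishes the Bayesian average it announces. A minimal sketch of that missing step, assuming the movie_stats frame defined above (the prior strength C and its heuristic are illustrative choices, not values from the original):

# Hypothetical continuation: Bayesian (damped) average per movie.
# C acts as a pseudo-count pulling sparsely rated movies toward the global mean m.
C = movie_stats['count'].mean()   # prior strength (one common heuristic)
m = ratings['rating'].mean()      # global mean rating

movie_stats['bayesian_avg'] = (
    (C * m + movie_stats['count'] * movie_stats['mean']) / (C + movie_stats['count'])
)

# Movies ranked by the damped average are far less noisy than raw means.
print(movie_stats.sort_values('bayesian_avg', ascending=False).head())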
--------------------------------------------------------------------------------
/app/ML/SimpleKNN.py:
--------------------------------------------------------------------------------
# k-nearest neighbours
import pandas
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler  # needed for the standardization step below


def k_near():
    """
    2 samples with 3 features each:
    a(a1, a2, a3), b(b1, b2, b3)
    Euclidean distance:
    p = sqrt((a1 - b1)^2 + (a2 - b2)^2 + (a3 - b3)^2)
    """
    # 1. Raw data
    # Read the data
    train_data = pandas.read_csv("k_near/train.csv")
    # print(train_data.head(10))

    # 2. Data processing
    # Filter the data
    train_data = train_data.query("x > 1.0 & x < 1.25 & y > 2.5 & y < 2.75")

    # Convert timestamps
    time_value = pandas.to_datetime(train_data["time"], unit="s")
    # Convert to a DatetimeIndex
    time_value = pandas.DatetimeIndex(time_value)
    # print(time_value)

    # Build features
    data = train_data.copy()
    data["day"] = time_value.day
    data["hour"] = time_value.hour
    data["weekday"] = time_value.weekday
    # print(train_data.head(10))

    # Drop the raw time column now that features are built; axis=1 drops columns
    data = data.drop(["time"], axis=1)

    # Drop rows whose place_id occurs too rarely
    place_count = data.groupby("place_id").count()
    # print(place_count)
    # Keep place IDs occurring more than 5 times, restoring place_id as a column
    tf = place_count[place_count.x > 5].reset_index()
    # print(tf)
    data = data[data["place_id"].isin(tf.place_id)]

    # Split into feature values and target values
    y = data["place_id"]
    x = data.drop(["place_id", "row_id"], axis=1)

    # Train/test split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # 3. Feature engineering (standardization)
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    # 4. Algorithm
    """
    Pros:
        simple, easy to understand and implement, no parameters to estimate, no training
    Cons:
        a lazy algorithm -- classifying test samples is compute- and memory-intensive;
        K must be specified, and a poor choice of K hurts accuracy
    Issues:
        small K: sensitive to outliers
        large K: sensitive to the class distribution among the K neighbours
        performance: every prediction loops over all the data
    """
    # K is n_neighbors: how many neighbouring points vote on the class
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(x_train, y_train)
    y_predict = knn.predict(x_test)
    print("predictions:", y_predict)

    # 5. Evaluation
    score = knn.score(x_test, y_test)
    print("accuracy:", score)
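The docstring warns that a poorly chosen K hurts accuracy, yet n_neighbors is hard-coded to 5. A minimal sketch of choosing K by cross-validation instead, assuming the x_train/y_train split produced inside k_near() above:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Search a small grid of K values with 3-fold cross-validation.
param_grid = {"n_neighbors": [3, 5, 7, 9, 11]}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=3)
search.fit(x_train, y_train)

print("best K:", search.best_params_["n_neighbors"])
print("cross-validated accuracy:", search.best_score_)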
"73",6.3,2.5,4.9,1.5,"versicolor" 75 | "74",6.1,2.8,4.7,1.2,"versicolor" 76 | "75",6.4,2.9,4.3,1.3,"versicolor" 77 | "76",6.6,3,4.4,1.4,"versicolor" 78 | "77",6.8,2.8,4.8,1.4,"versicolor" 79 | "78",6.7,3,5,1.7,"versicolor" 80 | "79",6,2.9,4.5,1.5,"versicolor" 81 | "80",5.7,2.6,3.5,1,"versicolor" 82 | "81",5.5,2.4,3.8,1.1,"versicolor" 83 | "82",5.5,2.4,3.7,1,"versicolor" 84 | "83",5.8,2.7,3.9,1.2,"versicolor" 85 | "84",6,2.7,5.1,1.6,"versicolor" 86 | "85",5.4,3,4.5,1.5,"versicolor" 87 | "86",6,3.4,4.5,1.6,"versicolor" 88 | "87",6.7,3.1,4.7,1.5,"versicolor" 89 | "88",6.3,2.3,4.4,1.3,"versicolor" 90 | "89",5.6,3,4.1,1.3,"versicolor" 91 | "90",5.5,2.5,4,1.3,"versicolor" 92 | "91",5.5,2.6,4.4,1.2,"versicolor" 93 | "92",6.1,3,4.6,1.4,"versicolor" 94 | "93",5.8,2.6,4,1.2,"versicolor" 95 | "94",5,2.3,3.3,1,"versicolor" 96 | "95",5.6,2.7,4.2,1.3,"versicolor" 97 | "96",5.7,3,4.2,1.2,"versicolor" 98 | "97",5.7,2.9,4.2,1.3,"versicolor" 99 | "98",6.2,2.9,4.3,1.3,"versicolor" 100 | "99",5.1,2.5,3,1.1,"versicolor" 101 | "100",5.7,2.8,4.1,1.3,"versicolor" 102 | "101",6.3,3.3,6,2.5,"virginica" 103 | "102",5.8,2.7,5.1,1.9,"virginica" 104 | "103",7.1,3,5.9,2.1,"virginica" 105 | "104",6.3,2.9,5.6,1.8,"virginica" 106 | "105",6.5,3,5.8,2.2,"virginica" 107 | "106",7.6,3,6.6,2.1,"virginica" 108 | "107",4.9,2.5,4.5,1.7,"virginica" 109 | "108",7.3,2.9,6.3,1.8,"virginica" 110 | "109",6.7,2.5,5.8,1.8,"virginica" 111 | "110",7.2,3.6,6.1,2.5,"virginica" 112 | "111",6.5,3.2,5.1,2,"virginica" 113 | "112",6.4,2.7,5.3,1.9,"virginica" 114 | "113",6.8,3,5.5,2.1,"virginica" 115 | "114",5.7,2.5,5,2,"virginica" 116 | "115",5.8,2.8,5.1,2.4,"virginica" 117 | "116",6.4,3.2,5.3,2.3,"virginica" 118 | "117",6.5,3,5.5,1.8,"virginica" 119 | "118",7.7,3.8,6.7,2.2,"virginica" 120 | "119",7.7,2.6,6.9,2.3,"virginica" 121 | "120",6,2.2,5,1.5,"virginica" 122 | "121",6.9,3.2,5.7,2.3,"virginica" 123 | "122",5.6,2.8,4.9,2,"virginica" 124 | "123",7.7,2.8,6.7,2,"virginica" 125 | "124",6.3,2.7,4.9,1.8,"virginica" 126 | "125",6.7,3.3,5.7,2.1,"virginica" 127 | "126",7.2,3.2,6,1.8,"virginica" 128 | "127",6.2,2.8,4.8,1.8,"virginica" 129 | "128",6.1,3,4.9,1.8,"virginica" 130 | "129",6.4,2.8,5.6,2.1,"virginica" 131 | "130",7.2,3,5.8,1.6,"virginica" 132 | "131",7.4,2.8,6.1,1.9,"virginica" 133 | "132",7.9,3.8,6.4,2,"virginica" 134 | "133",6.4,2.8,5.6,2.2,"virginica" 135 | "134",6.3,2.8,5.1,1.5,"virginica" 136 | "135",6.1,2.6,5.6,1.4,"virginica" 137 | "136",7.7,3,6.1,2.3,"virginica" 138 | "137",6.3,3.4,5.6,2.4,"virginica" 139 | "138",6.4,3.1,5.5,1.8,"virginica" 140 | "139",6,3,4.8,1.8,"virginica" 141 | "140",6.9,3.1,5.4,2.1,"virginica" 142 | "141",6.7,3.1,5.6,2.4,"virginica" 143 | "142",6.9,3.1,5.1,2.3,"virginica" 144 | "143",5.8,2.7,5.1,1.9,"virginica" 145 | "144",6.8,3.2,5.9,2.3,"virginica" 146 | "145",6.7,3.3,5.7,2.5,"virginica" 147 | "146",6.7,3,5.2,2.3,"virginica" 148 | "147",6.3,2.5,5,1.9,"virginica" 149 | "148",6.5,3,5.2,2,"virginica" 150 | "149",6.2,3.4,5.4,2.3,"virginica" 151 | "150",5.9,3,5.1,1.8,"virginica" -------------------------------------------------------------------------------- /app/ML/doc/AnalyseRecommandModel.md: -------------------------------------------------------------------------------- 1 | ***对RecommendModel模型的分析*** 2 | 3 | **import的部分** 4 | 5 | 1: numpy就不赘述了(做矩阵计算用的) 6 | 7 | 2: pandas 8 | 参考文档 https://www.pypandas.cn/docs/getting_started/basics.html#head-%E4%B8%8E-tail 9 | 3: sklearn,基本的机器学习的框架 10 | 4:seaborn, 这是一个基于matplotlib进行高级封装的可视化库,相比之下,绘制图表更为集成化、绘图风格具有更高的定制性 11 | 相关参考文档: https://pypi.org/project/seaborn/ 12 | 
--------------------------------------------------------------------------------
/app/ML/doc/AnalyseRecommandModel.md:
--------------------------------------------------------------------------------
***Analysis of the RecommandModel***

**Imports**

1: numpy needs no introduction (it does the matrix computation)

2: pandas
   Reference: https://www.pypandas.cn/docs/getting_started/basics.html#head-%E4%B8%8E-tail
3: sklearn, the basic machine-learning framework
4: seaborn, a visualization library that wraps matplotlib at a higher level; by comparison, its charts are more integrated and its plotting styles are more customizable.
   References: https://pypi.org/project/seaborn/
   https://zhuanlan.zhihu.com/p/342945532
5: scipy: https://www.biaodianfu.com/scipy-sparse.html
   covers building sparse matrices; see the documentation for details
6: the warnings module: https://blog.csdn.net/low5252/article/details/109334695


***Data***

First, the data: we have two usable datasets, ratings.csv and movies.csv.
ratings contains the fields userId, movieId, rating and timestamp;
movies contains three fields: movieId, title and genres.


***Code***

1: read the ratings file and take the first 5 rows (head; tail gives the last 5)
2: same for movies

n_ratings is the length of ratings;
n_movies and n_users are computed the same way
(unique deduplicates the values, so len of the result gives the count)

1: user_freq
Here the ratings data is split along the userId dimension into as many tables as there are distinct userId values, then each group's rows are counted (count);
reset_index keeps the groupby key as an ordinary column, which makes the result easier to read;
columns renames the table's columns;
then take the first five rows.

1: mean_rating
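A tiny self-contained illustration of the user_freq chain the note describes, on synthetic data rather than the real ratings.csv:

import pandas as pd

# Stand-in for ratings.csv, just to show groupby -> count -> reset_index.
ratings = pd.DataFrame({"userId": [1, 1, 2, 3, 3, 3],
                        "movieId": [10, 20, 10, 10, 20, 30]})

user_freq = ratings[["userId", "movieId"]].groupby("userId").count().reset_index()
user_freq.columns = ["userId", "n_ratings"]
print(user_freq)  # user 1 rated 2 movies, user 2 rated 1, user 3 rated 3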
--------------------------------------------------------------------------------
/app/ML/doc/index.md:
--------------------------------------------------------------------------------
### Common concepts
--------------------------------------------------------------------------------
/app/ML/index.md:
--------------------------------------------------------------------------------
### Testing some models

1: some references
https://www.cnblogs.com/ll409546297/p/11215141.html
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : __init__.py
# @Software: PyCharm

from flask import Flask
from app.web.book import web
from app.models.book import db


def create_app():
    app = Flask(__name__)
    # Load the configuration files
    app.config.from_object('app.secure')
    app.config.from_object('app.setting')
    register_blueprint(app)
    db.init_app(app)
    db.create_all(app=app)  # Flask-SQLAlchemy 2.x style: create tables for this app
    return app


def register_blueprint(app):
    app.register_blueprint(web)
--------------------------------------------------------------------------------
/app/api/__init__.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 2:10 PM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : __init__.py
# @Software: PyCharm
--------------------------------------------------------------------------------
/app/elasticsearch/bulk.py:
--------------------------------------------------------------------------------
import json
import time

from elasticsearch import Elasticsearch, helpers

es_url = "http://127.0.0.1:9200"
es = Elasticsearch(es_url)
print(es.info())  # call the method; a bare es.info only prints the bound method

configurations = {
    "index_name": "index_name",
    "index_type": "index_type",
    "request_body": {}
}

source_path = "/Users/wanggang/Documents/kba/kba.json"


def create_index():
    es.indices.create(index=configurations.get("index_name"), body=configurations.get("request_body"))
    print("create a new index")


# create_index()


def check_json_count():
    count = 0
    start_time = time.time()
    with open(source_path, "r", encoding="UTF-8") as fp:
        json_data = json.load(fp, strict=False)
        for item in json_data:
            count += 1
    end_time = time.time()
    t = end_time - start_time
    des = "Read {} records in {} seconds".format(count, t)
    return des


print(check_json_count())


def read_json_file():
    with open(source_path, "r", encoding="UTF-8") as fp:
        json_data = json.load(fp, strict=False)
        actions = []
        count = 0
        for item in json_data:
            count += 1
            action = {
                "_index": configurations.get("index_name"),
                "_type": configurations.get("index_type"),  # was "type"; bulk action metadata keys start with "_"
                "_source": item
            }
            actions.append(action)
            if len(actions) == 1000:
                helpers.bulk(es, actions)
                actions = []
        helpers.bulk(es, actions)  # flush the final partial batch


# read_json_file()


def check_json_count_block():
    count = 0
    block_size = 1024 * 8
    with open(source_path, "r", encoding="UTF-8") as fp:
        while True:
            chunk = fp.read(block_size)
            if not chunk:
                break
            count += chunk.count("SAP")
    return count


print("test number", check_json_count_block())
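Accumulating 1000-item lists by hand works, but elasticsearch-py can do the batching itself. A sketch using helpers.streaming_bulk with a generator, assuming the same es client, configurations dict and source_path as above:

def generate_actions():
    # Yield one action per record so the whole file never sits in an actions list.
    with open(source_path, "r", encoding="UTF-8") as fp:
        for item in json.load(fp, strict=False):
            yield {"_index": configurations.get("index_name"), "_source": item}


def bulk_with_streaming():
    ok_count = 0
    # streaming_bulk batches the actions (chunk_size per request) and
    # yields an (ok, result) pair per document.
    for ok, _ in helpers.streaming_bulk(es, generate_actions(), chunk_size=1000):
        ok_count += ok
    print("indexed", ok_count, "documents")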
--------------------------------------------------------------------------------
/app/elasticsearch/index.py:
--------------------------------------------------------------------------------
import json
import time

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch('http://127.0.0.1:9200')
print(es.info())
count = 0
start_time = time.time()
actions = []
source_path = "/Users/wanggang/Documents/kba/kba.json"
index_name = "kba"
index_type = "_doc"
with open(source_path, 'r', encoding='utf8') as fp:
    json_data = json.load(fp, strict=False)
    for item in json_data:
        count += 1
        action = {
            "_index": index_name,
            "_type": index_type,
            "_source": item  # was "source"; the document body belongs under "_source"
        }
        actions.append(action)
        if len(actions) == 1000:
            helpers.bulk(es, actions)
            actions = []
    helpers.bulk(es, actions)
print("total records:", count)
end_time = time.time()
t = end_time - start_time
print("imported", count, "records in", t, "seconds")
--------------------------------------------------------------------------------
/app/elasticsearch/readJson.py:
--------------------------------------------------------------------------------
import json

from elasticsearch import Elasticsearch

es = Elasticsearch(['http://127.0.0.1'], port=9200)
# Connected to es successfully
print(es.info())

mapping = {
    'properties': {
        'title': {
            'type': 'text',
            'analyzer': 'standard'
        }
    }
}

es.indices.create(index='news', ignore=400)
es.indices.put_mapping(index='news', doc_type='politics', include_type_name=True, body=mapping)

with open('/Users/wanggang/Desktop/index.json', 'r', encoding='utf8') as fp:
    json_data = json.load(fp)
    for item in json_data:
        print("item", item)
        es.index(index="news", doc_type='politics', document=item)
    print('JSON data from the file:', json_data)
    print('type of the parsed data:', type(json_data))
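These scripts only write to Elasticsearch. A quick way to confirm the documents landed, assuming the same 'news' index and a 7.x client as above (the query text is illustrative):

# Hypothetical smoke test: full-text match on the mapped 'title' field.
result = es.search(index='news', body={
    'query': {'match': {'title': 'election'}},
    'size': 5
})
for hit in result['hits']['hits']:
    print(hit['_score'], hit['_source'])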
--------------------------------------------------------------------------------
/app/elasticsearch/test.py:
--------------------------------------------------------------------------------
# !/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import uuid

from elasticsearch import Elasticsearch, helpers

# create a new instance of the Elasticsearch client class
elastic = Elasticsearch()
# ...or uncomment to use this instead:
# elastic = Elasticsearch("localhost")

'''
a simple function that gets the working path of
the Python script and returns it
'''


def script_path():
    path = os.path.dirname(os.path.realpath(__file__))
    if os.name == 'posix':  # posix is for macOS or Linux
        path = path + "/"
    else:
        path = path + chr(92)  # backslash is for Windows
    return path


'''
this function opens a file and returns its
contents as a list of strings split by linebreaks
'''


def get_data_from_file(file_name, path=script_path()):  # was misleadingly named `self`
    file = open(path + str(file_name), encoding="utf8", errors='ignore')
    data = [line.strip() for line in file]
    file.close()
    return data


'''
generator to push bulk data from a JSON
file into an Elasticsearch index
'''


def bulk_json_data(json_file, _index, doc_type):
    json_list = get_data_from_file(json_file)
    for doc in json_list:
        # use a `yield` generator so that the data
        # isn't loaded into memory
        if '{"index"' not in doc:
            yield {
                "_index": _index,
                "_type": doc_type,
                "_id": uuid.uuid4(),
                "_source": doc
            }


try:
    # make the bulk call, and get a response
    response = helpers.bulk(elastic, bulk_json_data("people.json", "employees", "people"))
    print("\nbulk_json_data() RESPONSE:", response)
except Exception as e:
    print("\nERROR:", e)

# iterator for a single document
actions = [
    {
        "_id": uuid.uuid4(),  # random UUID for _id
        "doc_type": "person",  # document _type
        "doc": {  # the body of the document
            "name": "George Peterson",
            "sex": "male",
            "age": 34,
            "years": 10
        }
    }
]

# iterator for multiple docs
actions = [
    {
        "_id": uuid.uuid4(),  # random UUID for _id
        "doc_type": "person",  # document _type
        "doc": {  # the body of the document
            "name": "George Peterson",
            "sex": "male",
            "age": 34 + doc,
            "years": 10 + doc
        }
    }
    for doc in range(100)  # use 'for' loop to insert 100 documents
]

try:
    # make the bulk call using 'actions' and get a response
    response = helpers.bulk(elastic, actions, index='employees', doc_type='people')
    print("\nactions RESPONSE:", response)
except Exception as e:
    print("\nERROR:", e)
--------------------------------------------------------------------------------
/app/forms/book.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 12:47 PM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : book.py
# @Software: PyCharm

from wtforms import Form, StringField, IntegerField
from wtforms.validators import Length, NumberRange, DataRequired, Regexp


class SearchForm(Form):
    q = StringField(validators=[DataRequired(), Length(min=1, max=30)])
    page = IntegerField(validators=[NumberRange(min=1, max=99)], default=1)
--------------------------------------------------------------------------------
/app/libs/__init__.py:
--------------------------------------------------------------------------------
# @Time : 9/23/2021 10:38 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : __init__.py
# @Software: PyCharm
--------------------------------------------------------------------------------
/app/libs/helper.py:
--------------------------------------------------------------------------------
# Decide whether a query word is a keyword or an ISBN
def is_isbn_or_key(word):
    # Default to keyword search
    isbn_or_key = "key"
    # A plain 13-digit string is treated as an ISBN-13
    if len(word) == 13 and word.isdigit():
        isbn_or_key = "isbn"
    # A hyphenated string with 10 digits once stripped is treated as an ISBN-10
    short_word = word.replace("-", "")
    if "-" in word and len(short_word) == 10 and short_word.isdigit():
        isbn_or_key = "isbn"
    return isbn_or_key
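A quick check of the two ISBN branches (the sample values are illustrative):

print(is_isbn_or_key("9787501524044"))  # 13 digits -> 'isbn'
print(is_isbn_or_key("7-5015-2404-6"))  # hyphenated, 10 digits stripped -> 'isbn'
print(is_isbn_or_key("flask"))          # anything else -> 'key'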
--------------------------------------------------------------------------------
/app/libs/httpRequest.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : httpRequest.py
# @Software: PyCharm

import requests


class HttpRequest:
    @staticmethod
    def get(url, return_json=True):
        # restful
        # json
        r = requests.get(url)
        if r.status_code != 200:
            return {} if return_json else ''
        return r.json() if return_json else r.text
--------------------------------------------------------------------------------
/app/models/book.py:
--------------------------------------------------------------------------------
# @Time : 9/23/2021 10:39 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : book.py
# @Software: PyCharm

# sqlalchemy

from sqlalchemy import Column, Integer, String
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()


class Book(db.Model):
    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String(50), nullable=False)
    author = Column(String(30), nullable=True, default='未名')
    binding = Column(String(20))
    publisher = Column(String(50))
    price = Column(String(20))
    pages = Column(Integer)
    pubdate = Column(String(20))
    isbn = Column(String(15), nullable=False, unique=True)
    summary = Column(String(1000))
    image = Column(String(50))
--------------------------------------------------------------------------------
/app/secure.py:
--------------------------------------------------------------------------------
# @Time : 9/20/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : secure.py
# @Software: PyCharm

DEBUG = False
SQLALCHEMY_DATABASE_URI = 'mysql+cymysql://root:dyh18215153215@localhost:3306/fisher'
--------------------------------------------------------------------------------
/app/setting.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 2:11 PM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : setting.py
# @Software: PyCharm

PER_PAGE = 15
--------------------------------------------------------------------------------
/app/spider/__init__.py:
--------------------------------------------------------------------------------
# @Time : 9/23/2021 10:26 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : __init__.py
# @Software: PyCharm
--------------------------------------------------------------------------------
/app/spider/yushu_book.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : yushu_book.py
# @Software: PyCharm


from app.libs.httpRequest import HttpRequest
from flask import current_app


class YushuBook:
    isbn_url = "http://t.talelin.com/v2/book/isbn/{}"
    keyword_url = "http://t.talelin.com/v2/book/search?q={}&count={}&start={}"

    @classmethod
    def search_by_isbn(cls, isbn):
        url = YushuBook.isbn_url.format(isbn)
        # dict
        result = HttpRequest.get(url)
        return result

    @classmethod
    def search_by_keyword(cls, keyword, page=1):
        url = YushuBook.keyword_url.format(keyword, current_app.config['PER_PAGE'], cls.calculate_start(page))
        # dict
        result = HttpRequest.get(url)
        return result

    @staticmethod
    def calculate_start(page):
        return (page - 1) * current_app.config['PER_PAGE']
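search_by_keyword reads current_app.config, so it only works inside a Flask application context. A sketch of calling it outside a request, assuming the create_app factory from app/__init__.py:

from app import create_app
from app.spider.yushu_book import YushuBook

app = create_app()
with app.app_context():  # makes current_app (and its PER_PAGE setting) available
    result = YushuBook.search_by_keyword("flask", page=1)
    print(result)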
--------------------------------------------------------------------------------
/app/testPy/DecisionTreeIris.pmml:
--------------------------------------------------------------------------------
(XML markup lost in extraction; only fragments survive. The file is a roughly
61-line PMML document exported on 2022-09-06T15:00:16Z for
PMMLPipeline(steps=[('classifier', DecisionTreeClassifier())]) -- the decision
tree fitted on Iris.csv by /app/testPy/index.py below.)
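To consume the exported model from Python, one option is the pypmml package (an assumption -- any PMML scoring engine would do; the feature names follow Iris.csv's headers):

from pypmml import Model  # assumes `pip install pypmml`

model = Model.fromFile("DecisionTreeIris.pmml")
# Score one classic setosa-like sample; the keys mirror the CSV column names.
print(model.predict({
    "Sepal.Length": 5.1, "Sepal.Width": 3.5,
    "Petal.Length": 1.4, "Petal.Width": 0.2,
}))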
--------------------------------------------------------------------------------
/app/testPy/groubBy.py:
--------------------------------------------------------------------------------
# @Time:2022/9/21 09:54
# @Author:Ray
# @File:groubBy.py

import pandas as pd
import numpy as np

# First build a test table

df = pd.DataFrame({'key1': list('aabba'),
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})

# print("df", df)

# 1: split the table by a single feature
grouped = df.groupby(["key1"])

for name, group in grouped:
    print(name)
    print(group)

# Printing shows that df is split by the distinct values of key1 into two tables:
# one where key1 == 'a' and one where key1 == 'b'.
# Splitting by any other feature works the same way.


# 2: split by key1, then take the mean of each feature within each group

groupedMean = df.groupby(["key1"]).mean()
# with a single key, groupby("key1") works just as well

print("groupedMean", groupedMean)

# 3: split the table by multiple features

for name, group in df.groupby(["key1", "key2"]):
    print("multiple features", name)
    print(group, "\n")

# A concrete use case:
# two shops, 1 and 2, each selling items a, b and c, each item with its own
# daily sales; to get each shop's monthly sales per item quickly, use groupby.

df2 = pd.DataFrame({'shop_id': list('111111222222'),
                    'item_id': list('abcabcabcabc'),
                    'item_daysales': [int(c) for c in '123456123456']})  # ints, not strings, so sum() adds instead of concatenating

print("df_field_table", df2[["shop_id", "item_id"]].groupby("shop_id"))

# Compute each shop's monthly sales per item
# reset_index keeps the grouping keys (here shop_id and item_id) as columns
month_grounded = df2.groupby(["shop_id", "item_id"]).sum().reset_index()

print("month_grounded", month_grounded)
--------------------------------------------------------------------------------
/app/testPy/index.py:
--------------------------------------------------------------------------------

import pandas
from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml


iris_df = pandas.read_csv("../ML/data/Iris.csv")

iris_X = iris_df[iris_df.columns.difference(["Species"])]
iris_y = iris_df["Species"]

pipeline = PMMLPipeline([
    ("classifier", DecisionTreeClassifier())
])
pipeline.fit(iris_X, iris_y)

sklearn2pmml(pipeline, "DecisionTreeIris.pmml", with_repr=True)
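A quick sanity check before shipping the export, assuming the pipeline above -- PMMLPipeline behaves like an ordinary sklearn pipeline:

from sklearn.metrics import accuracy_score

# Training accuracy only; a decision tree fits Iris almost perfectly,
# so this is a smoke test rather than a real evaluation.
predictions = pipeline.predict(iris_X)
print("training accuracy:", accuracy_score(iris_y, predictions))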
--------------------------------------------------------------------------------
/app/web/__init__.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : __init__.py
# @Software: PyCharm

from flask import Blueprint

web = Blueprint('web', __name__)
--------------------------------------------------------------------------------
/app/web/book.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : book.py
# @Software: PyCharm


from flask import jsonify, request

from app.libs.helper import is_isbn_or_key
from app.spider.yushu_book import YushuBook
from . import web
from app.forms.book import SearchForm


@web.route("/book/search")
def search():
    """
    q: a keyword or an ISBN
    page: page number
    """
    # Validate the incoming parameters
    # q = request.args['q']
    # page = request.args['page']
    form = SearchForm(request.args)
    if form.validate():
        q = form.q.data.strip()
        page = form.page.data
        isbn_or_key = is_isbn_or_key(q)
        if isbn_or_key == 'isbn':
            result = YushuBook.search_by_isbn(q)
        else:
            result = YushuBook.search_by_keyword(q, page)
        return jsonify(result)
        # return result
    else:
        return jsonify(form.errors)
--------------------------------------------------------------------------------
/app/web/user.py:
--------------------------------------------------------------------------------
# @Time : 9/22/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : user.py
# @Software: PyCharm
--------------------------------------------------------------------------------
/fisher.py:
--------------------------------------------------------------------------------

# @Time : 9/20/2021 10:47 AM
# @Author : arthur
# @Email : arthurwanggang@outlook.com
# @File : fisher.py
# @Software: PyCharm

from app import create_app

app = create_app()

if __name__ == '__main__':
    # production deployment: nginx + uwsgi
    app.run(host='0.0.0.0', debug=app.config['DEBUG'])
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
###### A website built with Python, Flask and MySQL
Basic links used:
Flask: https://flask.net.cn/

Python 3.x, macOS, MySQL database

Python docs: https://docs.python.org/3.6/tutorial/index.html

Sqlalchemy: https://www.sqlalchemy.org/
--------------------------------------------------------------------------------
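To exercise the /book/search endpoint once `python fisher.py` is running (Flask serves on port 5000 by default; the query values are illustrative):

import requests  # local smoke test against the running dev server

print(requests.get("http://127.0.0.1:5000/book/search", params={"q": "9787501524044"}).json())       # ISBN lookup
print(requests.get("http://127.0.0.1:5000/book/search", params={"q": "flask", "page": 1}).json())    # keyword search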