├── .gitignore
├── README.md
├── app
    ├── ConstFile.py
    ├── Utils.py
    ├── __init__.py
    ├── dao
    │   ├── MLModelDao.py
    │   ├── ModelDao.py
    │   ├── ModelExecuteDao.py
    │   ├── OperatorDao.py
    │   ├── OperatorTypeDao.py
    │   ├── ProjectDao.py
    │   └── __init__.py
    ├── enmus
    │   ├── EnumConst.py
    │   └── __init__.py
    ├── ml
    │   ├── __init__.py
    │   ├── multipleClassification
    │   │   ├── LR.py
    │   │   ├── MPC.py
    │   │   ├── RF.py
    │   │   └── __init__.py
    │   └── secondClassification
    │   │   ├── GBDT.py
    │   │   ├── LR.py
    │   │   ├── SVM.py
    │   │   └── __init__.py
    ├── models
    │   ├── MSEntity.py
    │   ├── ServerNameMap.py
    │   └── __init__.py
    ├── service
    │   ├── ClearTask.py
    │   ├── ExplorationService.py
    │   ├── FEService.py
    │   ├── MLModelService.py
    │   ├── ModelExecuteService.py
    │   ├── ModelService.py
    │   ├── PreprocessService.py
    │   ├── __init__.py
    │   └── ml
    │   │   ├── Evaluation.py
    │   │   ├── ModelService.py
    │   │   ├── MultipleClassifition.py
    │   │   ├── PredictService.py
    │   │   ├── SecondClassification.py
    │   │   └── __init__.py
    ├── test
    │   ├── FPGrowthTest.py
    │   ├── PySparkTest.py
    │   ├── RandomForestTest.py
    │   ├── Test.py
    │   └── zhoukang
    └── views
    │   ├── OperateFlow.py
    │   ├── OperateType.py
    │   ├── Operator.py
    │   ├── Project.py
    │   ├── ProjectModel.py
    │   ├── Report.py
    │   ├── Test.py
    │   ├── __init__.py
    │   ├── datasource
    │       ├── DataSource.py
    │       ├── __init__.py
    │       └── werkzeug
    │       │   ├── __init__.py
    │       │   └── utils.py
    │   └── v1
    │       ├── Exploration.py
    │       ├── FeatureEngineering.py
    │       ├── Mysql.py
    │       ├── Process.py
    │       ├── Process2.py
    │       └── __init__.py
├── config.py
├── requirements.txt
└── run.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | __pycache__/
3 | *.pyc
4 | .DS_Store
5 | app/.DS_Store
6 | .xml
7 | config.py
8 | run.py


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | *机器学习可视化项目*
2 | 数据探索-数据预处理-特征工程-模型训练和预测


--------------------------------------------------------------------------------
/app/ConstFile.py:
--------------------------------------------------------------------------------
 1 | class Const(object):
 2 |     class ConstError(TypeError):
 3 |         pass
 4 | 
 5 |     class ConstCaseError(ConstError):
 6 |         pass
 7 | 
 8 |     def __setattr__(self, name, value):
 9 |         if name in self.__dict__:  # 判断是否已经被赋值，如果是则报错
10 |             raise self.ConstError("Can't change const.%s" % name)
11 |         if not name.isupper():  # 判断所赋值是否是全部大写，用来做第一次赋值的格式判断，也可以根据需要改成其他判断条件
12 |             raise self.ConstCaseError('const name "%s" is not all supercase' % name)
13 | 
14 |         self.__dict__[name] = value
15 | 
16 | 
17 | const = Const()
18 | 
19 | const.ROOTURL = "/home/zk/project/"
20 | # const.ROOTURL = "/Users/kang/PycharmProjects/project"
21 | 
22 | # csv文件存储目录（临时）
23 | const.SAVEDIR = "/home/zk/project/test.csv"
24 | # const.SAVEDIR = '/Users/kang/PycharmProjects/project/test.csv'
25 | # const.SAVEDIR = "/Users/tc/Desktop/可视化4.0/Project/test.csv"
26 | 
27 | # exploration 临时视图存放
28 | const.JSONFILENAME = 'qazwsxedcrfvtgbyhnujmiopkl' + '.json'
29 | 
30 | # 算子运行产生的中间数据
31 | const.MIDDATA = '/home/zk/midData/'
32 | 


--------------------------------------------------------------------------------
/app/Utils.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from app.models.MSEntity import Project, ProcessFlow
  3 | import os, json, time
  4 | from app import db
  5 | import pandas as pd
  6 | from pyspark.sql import SparkSession
  7 | import uuid, shutil, traceback
  8 | from flask.json import jsonify
  9 | from app.ConstFile import const
 10 | 
 11 | 
 12 | def list_str_to_list(str):
 13 |     """
 14 |     字符串数组
 15 |     :param str:  "["1","2"]"
 16 |     :return:
 17 |     """
 18 |     if str is None or str == '':
 19 |         return {}
 20 |     change = json.loads("{\"key\":" + str.replace("'", "\"") + "}")
 21 |     return change['key']
 22 | 
 23 | 
 24 | # 返回数据
 25 | def returnDataModel(df, state, reason):
 26 |     if state:
 27 |         return jsonify({'state': state, 'reason': reason, 'length': df.count(), 'data': dfToJson(df, 50)})
 28 |     else:
 29 |         return jsonify({'state': state, 'reason': reason, 'length': 0, 'data': {}})
 30 | 
 31 | 
 32 | # 获取时间戳
 33 | def funTime():
 34 |     t = time.time()
 35 |     return str(int(round(t * 1000)))  # 毫秒级时间戳
 36 | 
 37 | 
 38 | # 获取一个新的SparkSession
 39 | def getSparkSession(userId, computationName):
 40 |     appName = str(userId) + "_" + computationName + '_' + str(funTime())
 41 |     print('Spark Session Name: ', appName)
 42 |     # ss = SparkSession \
 43 |     #     .builder \
 44 |     #     .appName(appName) \
 45 |     #     .master("spark://10.108.211.130:7077") \
 46 |     #     .getOrCreate()
 47 | 
 48 |     ss = SparkSession \
 49 |         .builder \
 50 |         .appName(appName) \
 51 |         .master("local[*]") \
 52 |         .getOrCreate()
 53 |     return ss
 54 | 
 55 | 
 56 | # 返回前nums条数据（json格式）
 57 | def dfToJson(df, nums):
 58 |     data_1 = df.limit(nums).toJSON().collect()
 59 |     data_2 = ",".join(data_1)
 60 |     data_3 = '[' + data_2 + ']'
 61 |     return json.loads(data_3)
 62 | 
 63 | 
 64 | # 获取处理流
 65 | def getProcessFlowByProjectId(projectId):
 66 |     try:
 67 |         filters = {
 68 |             ProcessFlow.project_id == projectId,
 69 |         }
 70 |         return ProcessFlow.query.filter(*filters).first()
 71 |     except:
 72 |         return "error"
 73 | 
 74 | 
 75 | # 追加处理流程记录
 76 | def addProcessingFlow(projectName, userId, operateType, operateParameter):
 77 |     try:
 78 |         operate = {}
 79 |         operate['type'] = operateType
 80 |         operate['key'] = str(uuid.uuid1())
 81 |         print(operate['key'])
 82 |         operate['operate'] = operateParameter
 83 |         print("追加处理流程", projectName, userId, operate)
 84 |         pflow = db.session.query(ProcessFlow.id, ProcessFlow.project_id, ProcessFlow.operates, ProcessFlow.cur_ope_id,
 85 |                                  ProcessFlow.links). \
 86 |             join(Project, Project.id == ProcessFlow.project_id). \
 87 |             filter(Project.project_name == projectName). \
 88 |             filter(Project.user_id == userId). \
 89 |             first()
 90 |         # 修改 operates
 91 |         # print(len(pflow))
 92 |         if not (pflow[2] == None or pflow[2] == ""):
 93 |             operates = json.loads(pflow[2])
 94 |         else:
 95 |             operates = []
 96 |         operates.append(operate)
 97 |         operateStr = json.dumps(operates, ensure_ascii=False)
 98 |         # 修改 links
 99 |         if not (pflow[3] == None or pflow[3] == ""):
100 |             link = {}
101 |             link['from'] = pflow[3]
102 |             link['to'] = operate['key']
103 |             if not (pflow[4] == None or pflow[4] == ""):
104 |                 links = json.loads(pflow[4])
105 |             else:
106 |                 links = []
107 |             links.append(link)
108 |             linkStr = json.dumps(links, ensure_ascii=False)
109 |         else:
110 |             linkStr = pflow[4]
111 |         filters = {
112 |             ProcessFlow.id == pflow[0],
113 |         }
114 |         result = ProcessFlow.query.filter(*filters).first()
115 |         result.operates = operateStr
116 |         result.links = linkStr
117 |         result.cur_ope_id = operate['key']
118 |         db.session.commit()
119 |         return ""
120 |     except Exception:
121 |         print('traceback.format_exc():\n%s' % traceback.format_exc())
122 |         print("追加数据流程出错")
123 |         return "追加数据流程出错"
124 | 
125 | 
126 | # addProcessingFlow('甜点销售数据预处理',1,{'type':'1','operate':'列名一,关系,值,组合关系;列名一,关系,值,'})
127 | 
128 | # 获取项目
129 | def getProjectByNameAndUserId(projectName, userId):
130 |     try:
131 |         print('projectName=', projectName, ' userId=', userId)
132 |         return db.session.query(Project).filter(Project.project_name == projectName) \
133 |             .filter(Project.user_id == userId) \
134 |             .first()
135 |     except:
136 |         return "error"
137 | 
138 | 
139 | # 获取项目的正在操作的数据文件地址
140 | def getProjectCurrentDataUrl(projectName):
141 |     try:
142 |         filters = {
143 |             Project.project_name == projectName
144 |         }
145 |         pro = Project.query.filter(*filters).first()
146 |         project_address = pro.project_address
147 |         filename = ''
148 |         for root, dirs, files in os.walk(project_address):
149 |             # print(root) #当前目录路径
150 |             # print(dirs) #当前路径下所有子目录
151 |             # print(files) #当前路径下所有非目录子文件
152 |             for file in files:
153 |                 if file[-4:] == '.csv':
154 |                     filename = file
155 |                     break
156 |             break
157 |         # print(filename)
158 |         if filename == '':
159 |             return "error"
160 |         else:
161 |             # return {'fileUrl': ProjectAddress+'/'+filename, 'projectAddress': ProjectAddress}
162 |             return {'fileUrl': 'file://' + project_address + '/' + filename, 'projectAddress': project_address}
163 |     except:
164 |         return "error"
165 | 
166 | 
167 | # 获取项目的正在操作的文件数据
168 | def getProjectCurrentData(ss, projectName):
169 |     # 解析项目路径，读取csv
170 |     urls = getProjectCurrentDataUrl(projectName)
171 |     if urls == 'error':
172 |         return "error: 项目名或项目路径有误"  # 错误类型：项目名或项目路径有误
173 |     fileUrl = urls['fileUrl']  # 读本地文件
174 |     df = ss.read.csv(fileUrl, header=True, inferSchema=True)
175 |     # ss.
176 |     # import pandas as pd
177 |     # sc = ss.sparkContext
178 |     # sqlContext = SQLContext(sc)
179 |     # df = pd.read_csv(fileUrl)
180 |     # df = sqlContext.createDataFrame(df)
181 | 
182 |     # df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(fileUrl)
183 |     return df
184 | 
185 | 
186 | def read_data_pandas(file_url):
187 |     """
188 |     pandas 读取数据
189 |     :param file_url:
190 |     :return:
191 |     """
192 |     if file_url[-4:] == ".csv":
193 |         df = pd.read_csv(file_url, encoding="utf-8")
194 |     else:
195 |         df = pd.read_excel(file_url, encoding="utf-8")
196 |     return df
197 | 
198 | 
199 | def save_data_pandas(data, file_type="", file_url="", index=0):
200 |     """
201 |     pandas 写数据
202 |     :return:
203 |     """
204 |     if file_type == "":
205 |         file_type = 'csv'
206 |     if file_url == "":
207 |         file_url = const.MIDDATA + str(uuid.uuid1())
208 | 
209 |     if file_type == 'json':
210 |         file_url = file_url + '.json'
211 |         json_str = json.dumps(data, ensure_ascii=False)
212 |         with open(file_url, "w", encoding="utf-8") as f:
213 |             json.dump(json_str, f, ensure_ascii=False)
214 |     elif file_type == 'csv':
215 |         file_url = file_url + '.csv'
216 |         data.to_csv(file_url, header=True, index=index)
217 | 
218 |     return file_url
219 | 
220 | 
221 | def read_data(ss, file_url):
222 |     """
223 |     spark 读取数据
224 |     :param ss:spark session
225 |     :param file_url:
226 |     :return:
227 |     """
228 | 
229 |     df = ss.read.csv(file_url, header=True, inferSchema=True)
230 |     return df
231 | 
232 | 
233 | def save_data(df, file_url=""):
234 |     """
235 |     保存数据
236 |     :param df:
237 |     :param file_url:
238 |     :return:
239 |     """
240 |     if file_url == "":
241 |         file_url = const.MIDDATA + str(uuid.uuid1()) + '.csv'
242 |     df.toPandas().to_csv(file_url, header=True, index=0)
243 |     return file_url
244 | 
245 | 
246 | def mkdir(path):
247 |     """
248 |     根据指定路径创建文件夹
249 |     :param path:
250 |     :return:
251 |     """
252 |     import os
253 | 
254 |     # 去除首位空格
255 |     path = path.strip()
256 |     # 去除尾部 \ 符号
257 |     path = path.rstrip("\\")
258 | 
259 |     # 判断路径是否存在
260 |     # 存在     True
261 |     # 不存在   False
262 |     isExists = os.path.exists(path)
263 | 
264 |     # 判断结果
265 |     if not isExists:
266 |         # 如果不存在则创建目录
267 |         print(path + ' 创建成功')
268 |         # 创建目录操作函数
269 |         os.makedirs(path)
270 |         return True
271 |     else:
272 |         # 如果目录存在则不创建，并提示目录已存在
273 |         print(path + ' 目录已存在')
274 |         return False
275 | 
276 | 
277 | def deldir(path):
278 |     import os
279 |     if os.path.exists(path):
280 |         # 删除文件，可使用以下两种方法。
281 |         os.remove(path)
282 |         return True
283 |     else:
284 |         print('no such file:%s' % path)
285 |         return False
286 | 
287 | 
288 | def deltree(path):
289 |     import os
290 |     if os.path.exists(path):
291 |         shutil.rmtree(path)  # 递归删除文件夹
292 |         return True
293 |     else:
294 |         print('no such path:%s' % path)
295 |         return False
296 | 
297 | 
298 | def is_number(s):
299 |     try:
300 |         float(s)
301 |         return True
302 |     except ValueError:
303 |         pass
304 | 
305 |     try:
306 |         import unicodedata
307 |         unicodedata.numeric(s)
308 |         return True
309 |     except (TypeError, ValueError):
310 |         pass
311 |     return False
312 | 


--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | 
 3 | from flask import Flask
 4 | from flask_sqlalchemy import SQLAlchemy
 5 | 
 6 | app = Flask(__name__)
 7 | 
 8 | 
 9 | # 跨域 方法一 ：falsk_cors模块
10 | # from flask_cors import CORS
11 | # CORS(app, supports_credentials=True)
12 | 
13 | # 跨域支持 方法二：flask 内置的after_request()方法
14 | def after_request(resp):
15 |     resp.headers['Access-Control-Allow-Origin'] = '*'
16 |     return resp
17 | 
18 | 
19 | app.after_request(after_request)
20 | 
21 | # 加载配置文件
22 | app.config.from_object('config')
23 | db = SQLAlchemy(app)
24 | #
25 | from app.views.datasource import DataSource
26 | from app.views import Project, OperateType, OperateFlow, ProjectModel, Report, Operator
27 | 
28 | # from app import test
29 | 


--------------------------------------------------------------------------------
/app/dao/MLModelDao.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf8
 2 | from app import db
 3 | from app.models.MSEntity import MLModel
 4 | import traceback
 5 | 
 6 | """
 7 | 提供 ml_model（模型算子训练结果保存表） 的增删改查
 8 | """
 9 | 
10 | 
11 | def create_ml_model(ml_model):
12 |     """
13 |     创建 新的 ml_model记录（）保存模型
14 |     :param ml_model: 类型 [MLModel]
15 |     :return:
16 |     """
17 |     try:
18 |         session = db.session
19 |         session.add(ml_model)
20 |         session.commit()
21 |         print('成功创建一个算子')
22 |         return True
23 | 
24 |     except Exception:
25 |         print(traceback.print_exc())
26 |         return False
27 | 
28 | 
29 | def get_ml_model(ml_model_id):
30 |     """
31 |     查询记录
32 |     :param ml_model_id:
33 |     :return:
34 |     """
35 | 
36 |     filters = {MLModel.id == ml_model_id}
37 |     ml_model = MLModel.query.filter(*filters).first()
38 |     return ml_model
39 | 


--------------------------------------------------------------------------------
/app/dao/ModelDao.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf8
 2 | from app.models.MSEntity import Model
 3 | from app import db
 4 | import traceback
 5 | import json
 6 | import app.Utils as utils
 7 | 
 8 | """
 9 | 提供 model（项目） 表的增删改查
10 | """
11 | 
12 | 
13 | def get_model_by_id(id):
14 |     """
15 |     通过ID 获取 model
16 |     :param id:
17 |     :return:
18 |     """
19 | 
20 |     try:
21 |         query = db.session.query(Model).filter(Model.id == id).first()
22 |         db.session.commit()
23 |         return query
24 | 
25 |     except Exception:
26 |         print(traceback.print_exc())
27 |         return False
28 | 
29 | 
30 | def get_model_by_project_id(project_id):
31 |     """
32 |     通过项目ID 获取 项目对应的model
33 |     :return:
34 |     """
35 | 
36 |     try:
37 |         query = db.session.query(Model).filter(Model.project_id == project_id).first()
38 |         db.session.commit()
39 |         return query
40 | 
41 |     except Exception:
42 |         print(traceback.print_exc())
43 |         return False
44 | 
45 | 
46 | def update_with_project_id(project_id, start_nodes, relationship, config_order):
47 |     """
48 |     通过项目ID 更新 项目对应的model
49 |     :return:
50 |     """
51 | 
52 |     try:
53 |         start_nodes = utils.list_str_to_list(start_nodes)
54 |         relationship = utils.list_str_to_list(relationship)
55 |         relationship_item_str = []
56 |         for item in relationship:
57 |             relationship_item_str.append(str(item))
58 |         config = json.dumps({'config_order': config_order, 'relationship': '*,'.join(relationship_item_str)},
59 |                             ensure_ascii=False)
60 |         query = db.session.query(Model)
61 |         query.filter(Model.project_id == project_id).update(
62 |             {Model.start_nodes: ','.join(start_nodes),
63 |              Model.config: config})
64 |         db.session.commit()
65 |         print('更新完成')
66 |         return True
67 | 
68 |     except Exception:
69 |         print(traceback.print_exc())
70 |         return False
71 | 


--------------------------------------------------------------------------------
/app/dao/ModelExecuteDao.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf8
 2 | from app.models.MSEntity import ModelExecute
 3 | from app import db
 4 | import traceback
 5 | 
 6 | """
 7 | 模型执行表 增删改查
 8 | """
 9 | 
10 | 
11 | def get_model_execute_by_id(model_execute_id):
12 |     """
13 |     通过 id 查询 model_execute
14 |     :param model_execute_id:
15 |     :return:
16 |     """
17 |     try:
18 |         query = db.session.query(ModelExecute).filter(ModelExecute.id == model_execute_id).first()
19 |         db.session.commit()
20 |         return query
21 |     except Exception:
22 |         print(traceback.print_exc())
23 |         return False
24 | 
25 | 
26 | def create_model_execute(model_execute):
27 |     """
28 |     创建 model_execute
29 |     :param model_execute: 类型 ModelExecute
30 |     :return:
31 |     """
32 |     try:
33 |         session = db.session
34 |         session.add(model_execute)
35 |         session.commit()
36 |         print('成功创建一条执行记录')
37 |         return model_execute
38 |     except Exception:
39 |         print(traceback.print_exc())
40 |         return False
41 | 
42 | 
43 | def update_model_execute(model_execute_id, status, run_info, end_time):
44 |     """
45 |     更新 model_execute
46 | 
47 |     :param model_execute_id:
48 |     :param status:
49 |     :param run_info:
50 |     :param end_time:
51 |     :return:
52 |     """
53 |     try:
54 |         filters = {
55 |             ModelExecute.id == model_execute_id,
56 |         }
57 |         result = ModelExecute.query.filter(*filters).first()
58 |         result.status = status
59 |         result.run_info = run_info
60 |         result.end_time = end_time
61 |         db.session.commit()
62 |         return True
63 |     except Exception as e:
64 |         print(traceback.print_exc())
65 |         return False
66 | 


--------------------------------------------------------------------------------
/app/dao/OperatorDao.py:
--------------------------------------------------------------------------------
  1 | # encoding=utf8
  2 | from app.models.MSEntity import Operator
  3 | from app import db
  4 | import traceback
  5 | 
  6 | """
  7 | operator（算子）表 增删改查
  8 | """
  9 | 
 10 | 
 11 | def update_operator_by_id(operator_id, status, operator_output_url="", run_info=""):
 12 |     """
 13 |     通过 operator_id 更新 operator的执行状态、结果保存路径、运行信息
 14 |     :param operator_id:
 15 |     :param status:
 16 |     :param operator_output_url:
 17 |     :param run_info:
 18 |     :return:
 19 |     """
 20 |     try:
 21 |         filters = {
 22 |             Operator.id == operator_id,
 23 |         }
 24 |         result = Operator.query.filter(*filters).first()
 25 |         result.status = status
 26 |         result.operator_output_url = operator_output_url
 27 |         result.run_info = run_info
 28 |         db.session.commit()
 29 |         return True
 30 |     except Exception as e:
 31 |         print(traceback.print_exc())
 32 |         return False
 33 | 
 34 | 
 35 | def update_operator_input_url(operator_id, operator_input_url):
 36 |     """
 37 |     通过 operator_id 更新 operator的输入路径
 38 |     :param operator_id:
 39 |     :param operator_input_url:
 40 |     :return:
 41 |     """
 42 |     try:
 43 |         filters = {
 44 |             Operator.id == operator_id,
 45 |         }
 46 |         result = Operator.query.filter(*filters).first()
 47 |         result.operator_input_url = operator_input_url
 48 |         db.session.commit()
 49 |         return True
 50 |     except Exception as e:
 51 |         print(traceback.print_exc())
 52 |         return False
 53 | 
 54 | 
 55 | def get_operator_by_id(operator_id):
 56 |     """
 57 |     通过 id 查询 operator
 58 |     :param operator_id:
 59 |     :return:
 60 |     """
 61 |     try:
 62 |         query = db.session.query(Operator).filter(Operator.id == operator_id).first()
 63 |         db.session.commit()
 64 |         return query
 65 |     except Exception:
 66 |         print(traceback.print_exc())
 67 |         return False
 68 | 
 69 | 
 70 | def get_operator_by_ids(operator_ids):
 71 |     """
 72 |     TODO:不好使待调试
 73 |     通过 id集合 查询 operator
 74 |     :param operator_ids:[]
 75 |     :return:
 76 |     """
 77 |     try:
 78 |         query = db.session.query(Operator).filter(Operator.id in operator_ids)
 79 |         db.session.commit()
 80 |         return query
 81 |     except Exception:
 82 |         print(traceback.print_exc())
 83 |         return False
 84 | 
 85 | 
 86 | def get_operator_by_model_id(model_id):
 87 |     """
 88 |     通过 model_id 查询 operator
 89 |     :param model_id:
 90 |     :return:
 91 |     """
 92 |     try:
 93 |         query = db.session.query(Operator).filter(Operator.model_id == model_id)
 94 |         db.session.commit()
 95 |         return query
 96 |     except Exception:
 97 |         print(traceback.print_exc())
 98 |         return False
 99 | 
100 | 
101 | def delete_operator_by_model_id(model_id):
102 |     """
103 |     通过 model_id 删除 operator
104 |     :param model_id:
105 |     :return:
106 |     """
107 |     try:
108 |         db.session.query(Operator).filter(Operator.model_id == model_id).delete()
109 |         db.session.commit()
110 |         return True
111 |     except Exception:
112 |         print(traceback.print_exc())
113 |         return False
114 | 
115 | 
116 | def delete_operator_by_id(operator_id):
117 |     """
118 |     通过 id 删除 operator
119 |     :param operator_id:
120 |     :return:
121 |     """
122 |     try:
123 |         db.session.query(Operator).filter(Operator.id == operator_id).delete()
124 |         db.session.commit()
125 |         return True
126 |     except Exception:
127 |         print(traceback.print_exc())
128 |         return False
129 | 
130 | 
131 | def create_operator(operators):
132 |     """
133 |     创建 新的operator（算子）
134 |     :param operators: 类型 [Operator]
135 |     :return:
136 |     """
137 |     try:
138 |         session = db.session
139 |         session.add_all(operators)
140 |         session.commit()
141 |         return True
142 | 
143 |     except Exception:
144 |         print(traceback.print_exc())
145 |         return False
146 | 


--------------------------------------------------------------------------------
/app/dao/OperatorTypeDao.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf8
 2 | from app.models.MSEntity import OperatorType
 3 | 
 4 | """
 5 | operator_type（算子种类）表 增删改查
 6 | """
 7 | 
 8 | 
 9 | def get_all_operator_type():
10 |     """
11 |     查询所有的 算子种类
12 |     :return:
13 |     """
14 |     return OperatorType.query.all()
15 | 


--------------------------------------------------------------------------------
/app/dao/ProjectDao.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf8
 2 | from app.models.MSEntity import Project
 3 | 
 4 | """
 5 | 提供 project（项目） 表的增删改查
 6 | """
 7 | 
 8 | 
 9 | def get_project_by_id(project_id):
10 |     """
11 |     通过项目ID获取项目
12 |     :return:
13 |     """
14 | 
15 |     try:
16 |         filters = {
17 |             Project.id == project_id,
18 |         }
19 |         return Project.query.filter(*filters).first()
20 |     except Exception:
21 |         return "error"
22 | 


--------------------------------------------------------------------------------
/app/dao/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | dao 层
3 | 提供操作数据库的功能函数
4 | """
5 | 


--------------------------------------------------------------------------------
/app/enmus/EnumConst.py:
--------------------------------------------------------------------------------
 1 | # 导入枚举类
 2 | from enum import Enum
 3 | 
 4 | """
 5 | v1版本，弃用
 6 | """
 7 | 
 8 | 
 9 | # 继承枚举类
10 | # 算子编码枚举类
11 | class OperatorType(Enum):
12 |     ## 数据预处理
13 |     # 过滤
14 |     FILTER = '1001'
15 |     # 排序
16 |     SORT = '1002'
17 |     # 按列拆分
18 |     COLUMNSPLIT = '1003'
19 |     # 按行拆分
20 |     ROWSPLIT = '1004'
21 |     # 多列合并
22 |     COLUMNMERGE = '1005'
23 |     # 数据列替换
24 |     REPLACE = '1006'
25 |     # 空值填充
26 |     FILLNULLVALUE = '1007'
27 |     # 列映射
28 |     COLUMNMAP = '1008'
29 | 
30 |     ## 特征工程
31 |     # 分位数离散化
32 |     QUANTILEDISCRETIZATION = '2001'
33 |     # 向量索引转换
34 |     VECTORINDEXER = '2002'
35 |     # 标准化列
36 |     STANDARDSCALER = '2003'
37 |     # 降维
38 |     PCA = '2004'
39 |     # 字符串转标签
40 |     STRINGINDEXER = '2005'
41 |     # 独热编码
42 |     ONEHOTENCODER = '2006'
43 |     # 多项式扩展
44 |     POLYNOMIALEXPANSION = '2007'
45 |     # 卡放选择
46 |     CHISQSELECTOR = '2008'
47 | 


--------------------------------------------------------------------------------
/app/enmus/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/enmus/__init__.py


--------------------------------------------------------------------------------
/app/ml/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/ml/__init__.py


--------------------------------------------------------------------------------
/app/ml/multipleClassification/LR.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, LogisticRegressionSummary, \
  3 |     LogisticRegressionTrainingSummary
  4 | from pyspark.sql import Row, DataFrame
  5 | from pyspark.ml.linalg import Vectors
  6 | from app.Utils import *
  7 | 
  8 | '''
  9 | class pyspark.ml.classification.LogisticRegression(featuresCol='features', labelCol='label', predictionCol='prediction', 
 10 | maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, fitIntercept=True, threshold=0.5, thresholds=None, 
 11 | probabilityCol='probability', rawPredictionCol='rawPrediction', standardization=True, weightCol=None, 
 12 | aggregationDepth=2, family='auto', lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, 
 13 | lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None)
 14 | 
 15 | featuresCol # 特征列
 16 | labelCol # 标签列
 17 | predictionCol # 预测输出列
 18 | maxIter # 最大迭代轮数
 19 | regParam # 正则化参数
 20 | elasticNetParam # ElasticNet混合参数，范围为[0，1]。对于alpha = 0，惩罚是L2惩罚。对于alpha = 1，这是L1惩罚（默认值：0.0）
 21 | tol # 迭代算法的收敛容限（> = 0）（默认值：1e-06）
 22 | fitIntercept # 是否训练截距项（默认值：True）
 23 | threshold # 二进制分类预测中的阈值，范围为[0，1]（默认值：0.5）。
 24 | thresholds # 多类别分类中的阈值，用于调整预测每个类别的概率。数组的长度必须等于类的数量，其值必须大于0，但最多一个值可以为0，这是预测值。p/t最大的类是可预测的，其中p是该类的原始概率，t是该类的概率阈值（未定义）
 25 | rawPredictionCol：原始预测（也称为置信度）列名称（默认值：rawPrediction）
 26 | standardization: whether to standardize the training features before fitting the model (default: True)
 27 | weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0 (current: weight)
 28 | aggregationDepth # suggested depth for treeAggregate (>= 2) (default: 2)
 29 | family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial. (default: auto)
 30 | lowerBoundsOnCoefficients：如果在边界约束优化下拟合，则系数的下界。 （未定义）
 31 | lowerBoundsOnIntercepts：如果在边界约束优化下拟合，则截距的下限。 （未定义）
 32 | upperBoundsOnIntercepts：如果在边界约束优化下拟合，则截距的上限。 （未定义）
 33 | 
 34 | '''
 35 | 
 36 | 
 37 | def is_number(s):
 38 |     try:
 39 |         float(s)
 40 |         return True
 41 |     except ValueError:
 42 |         pass
 43 | 
 44 |     try:
 45 |         import unicodedata
 46 |         unicodedata.numeric(s)
 47 |         return True
 48 |     except (TypeError, ValueError):
 49 |         pass
 50 | 
 51 |     return False
 52 | 
 53 | 
 54 | def project_url(projectName):
 55 |     urls = getProjectCurrentDataUrl(projectName)
 56 |     if urls == 'error':
 57 |         return "error: 项目名或项目路径有误"  # 错误类型：项目名或项目路径有误
 58 |     return urls['projectAddress']  # 项目地址
 59 | 
 60 | 
 61 | def lr(ss, data, label_index, feature_indexs, project_url):
 62 |     # 1.构造训练数据集
 63 |     def func(x):
 64 |         features_data = []
 65 |         for feature in feature_indexs:
 66 |             if (is_number(x[feature])):
 67 |                 features_data.append(float(x[feature]))
 68 |             else:
 69 |                 features_data.append(0.0)
 70 |         label_data = 0.0
 71 |         if (is_number(x[label_index])):
 72 |             label_data = float(x[label_index])
 73 |         return Row(label=label_data, features=Vectors.dense(features_data))
 74 | 
 75 |     training_set = data.rdd.map(list).map(lambda x: func(x)).toDF()
 76 | 
 77 |     # 2.训练模型
 78 |     lr_param = LogisticRegression(regParam=0.01, family='multinomial')
 79 |     lr_model = lr_param.fit(training_set)
 80 |     print(lr_model.coefficientMatrix)  # 系数
 81 |     print(lr_model.interceptVector)  # 截距
 82 |     # print(lr_model.explainParams())  # 参数以及其注解
 83 | 
 84 |     # 3.保存模型
 85 |     # model_path = project_url + '/model/multipleClassification/lr'
 86 |     # lr_model.write().overwrite().save(model_path)
 87 |     #
 88 |     # # 4.读取模型
 89 |     # lr2 = lr_model.load(model_path)
 90 | 
 91 |     # 5.预测
 92 |     result = lr_model.transform(training_set).head()
 93 |     print(result.prediction)
 94 | 
 95 |     LogisticRegressionTrainingSummary
 96 |     sum = lr_model.summary
 97 | 
 98 |     # 6.评估
 99 |     summary = lr_model.evaluate(training_set)
100 |     summary.show()
101 | 
102 | 
103 | userId = 1
104 | functionName = 'lr'
105 | projectName = '订单分析'
106 | label = 0  # 标签列
107 | features = [2, 4, 10, 11, 12]  # 特征列
108 | project_path = project_url(projectName)  # 项目路径
109 | # spark会话
110 | ss = getSparkSession(userId, functionName)
111 | # 解析项目路径，读取csv
112 | fileUrl = '/home/zk/data/adult.csv'
113 | df = ss.read.csv(fileUrl)
114 | df.filter
115 | print(df.dtypes)
116 | 
117 | df.show()
118 | # df = getProjectCurrentData(ss, projectName)
119 | # 罗辑回归二分类
120 | lr(ss, df, label, features, project_path)
121 | 


--------------------------------------------------------------------------------
/app/ml/multipleClassification/MPC.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | from pyspark.ml.classification import MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel
 3 | from pyspark.sql import Row, DataFrame
 4 | from pyspark.ml.linalg import Vectors
 5 | from app.Utils import *
 6 | 
 7 | 
 8 | # userId = 1
 9 | # functionName = 'gdbt'
10 | # projectName = '订单分析'
11 | # # spark会话
12 | # spark = getSparkSession(userId, functionName)
13 | # df = spark.createDataFrame([
14 | #     (0.0, Vectors.dense([0.0, 0.0])),
15 | #     (1.0, Vectors.dense([0.0, 1.0])),
16 | #     (1.0, Vectors.dense([1.0, 0.0])),
17 | #     (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"])
18 | # mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 2, 2], blockSize=1, seed=123)
19 | # model = mlp.fit(df)
20 | # print(model.layers)
21 | # print(model.weights.size)
22 | # testDF = spark.createDataFrame([
23 | #     (Vectors.dense([1.0, 0.0]),),
24 | #     (Vectors.dense([0.0, 0.0]),)], ["features"])
25 | # model.transform(testDF).select("features", "prediction").show()
26 | 
27 | 
28 | def project_url(projectName):
29 |     urls = getProjectCurrentDataUrl(projectName)
30 |     if urls == 'error':
31 |         return "error: 项目名或项目路径有误"  # 错误类型：项目名或项目路径有误
32 |     return urls['projectAddress']  # 项目地址
33 | 
34 | 
35 | def mpc(ss, data, label_index, feature_indexs, project_url):
36 |     # 1.构造训练数据集
37 |     def func(x):
38 |         features_data = []
39 |         for feature in feature_indexs:
40 |             features_data.append(x[feature])
41 |         return Row(label=label_index, features=Vectors.dense(features_data))
42 | 
43 |     training_set = data.rdd.map(lambda x: func(x)).toDF()
44 | 
45 |     # 2.训练模型
46 |     # maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs", initialWeights=None
47 |     mpc_param = MultilayerPerceptronClassifier(maxIter=100, tol=1e-6, blockSize=128, stepSize=0.03, solver="l-bfgs")
48 |     mpc_param.setSeed(1)
49 |     mpc_param.setLayers([4, 2, 2])
50 |     mpc_model = mpc_param.fit(training_set)
51 | 
52 |     # 3.保存模型
53 |     model_path = project_url + '/model/multipleClassification/mpc'
54 |     mpc_model.write().overwrite().save(model_path)
55 | 
56 |     # 4.读取模型
57 |     mpc2 = MultilayerPerceptronClassificationModel.load(model_path)
58 | 
59 |     # 5.预测
60 |     result = mpc2.transform(training_set).select("prediction", "features").show()
61 | 
62 | 
63 | userId = 1
64 | functionName = 'mpc'
65 | projectName = '订单分析'
66 | label = 0  # 标签列
67 | features = [12, 13, 14, 15]  # 特征列
68 | project_path = project_url(projectName)  # 项目路径
69 | # spark会话
70 | ss = getSparkSession(userId, functionName)
71 | # 解析项目路径，读取csv
72 | df = getProjectCurrentData(ss, projectName)
73 | # 罗辑回归二分类
74 | mpc(ss, df, label, features, project_path)
75 | 


--------------------------------------------------------------------------------
/app/ml/multipleClassification/RF.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, LogisticRegressionSummary, \
 3 |     LogisticRegressionTrainingSummary, RandomForestClassifier
 4 | from pyspark.sql import Row, DataFrame
 5 | from pyspark.ml.linalg import Vectors
 6 | from app.Utils import *
 7 | 
 8 | '''
 9 |     It supports both binary and multiclass labels, as well as both continuous and categoricaleatures.
10 | 
11 | '''
12 | 
13 | 
14 | def is_number(s):
15 |     try:
16 |         float(s)
17 |         return True
18 |     except ValueError:
19 |         pass
20 | 
21 |     try:
22 |         import unicodedata
23 |         unicodedata.numeric(s)
24 |         return True
25 |     except (TypeError, ValueError):
26 |         pass
27 | 
28 |     return False
29 | 
30 | 
31 | def project_url(projectName):
32 |     urls = getProjectCurrentDataUrl(projectName)
33 |     if urls == 'error':
34 |         return "error: 项目名或项目路径有误"  # 错误类型：项目名或项目路径有误
35 |     return urls['projectAddress']  # 项目地址
36 | 
37 | 
38 | def rf(ss, data, label_index, feature_indexs, project_url):
39 |     # 1.构造训练数据集
40 |     def func(x):
41 |         features_data = []
42 |         for feature in feature_indexs:
43 |             features_data.append(x[feature])
44 |         return Row(label=label_index, features=Vectors.dense(features_data))
45 | 
46 |     training_set = data.rdd.map(list).map(lambda x: func(x)).toDF()
47 | 
48 |     # 2.训练模型
49 |     rf_param = RandomForestClassifier(numTrees=50)
50 |     rf_model = rf_param.fit(training_set)
51 | 
52 |     # 3.保存模型
53 |     model_path = project_url + '/model/multipleClassification/rf'
54 |     rf_model.write().overwrite().save(model_path)
55 | 
56 |     # 4.读取模型
57 |     rf2 = rf_model.load(model_path)
58 | 
59 |     # 5.预测
60 |     rf_pred = rf2.transform(training_set)
61 |     rf_pred.select("prediction", "features").show()
62 | 
63 |     from pyspark.ml.evaluation import MulticlassClassificationEvaluator
64 |     # 6.评估
65 |     rf_accuracy = MulticlassClassificationEvaluator(metricName='accuracy').evaluate(rf_pred)
66 |     print("RF's accuracy is %f" % rf_accuracy)
67 |     rf_precision = MulticlassClassificationEvaluator(metricName='weightedPrecision').evaluate(rf_pred)
68 |     print("RF's precision is %f" % rf_precision)
69 | 
70 | 
71 | userId = 1
72 | functionName = 'lr'
73 | projectName = '订单分析'
74 | label = 0  # 标签列
75 | features = [2, 4, 10, 11, 12]  # 特征列
76 | project_path = project_url(projectName)  # 项目路径
77 | # spark会话
78 | ss = getSparkSession(userId, functionName)
79 | # 解析项目路径，读取csv
80 | fileUrl = '/home/zk/data/adult.csv'
81 | df = ss.read.csv(fileUrl)
82 | df.filter
83 | print(df.dtypes)
84 | 
85 | df.show()
86 | # df = getProjectCurrentData(ss, projectName)
87 | # 罗辑回归二分类
88 | rf(ss, df, label, features, project_path)
89 | 


--------------------------------------------------------------------------------
/app/ml/multipleClassification/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/ml/multipleClassification/__init__.py


--------------------------------------------------------------------------------
/app/ml/secondClassification/GBDT.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | from pyspark.ml.linalg import Vectors
 3 | from pyspark.ml.classification import GBTClassifier, GBTClassificationModel
 4 | from pyspark.ml.feature import StringIndexer
 5 | from pyspark.sql.types import Row
 6 | from app.Utils import *
 7 | 
 8 | # GBDT(Gradient Boosting Decision Tree) 又叫 MART（Multiple Additive Regression Tree)，是一种迭代的决策树算法，
 9 | # 该算法由多棵决策树组成，所有树的结论累加起来做最终答案。
10 | 
11 | 
12 | # GBTC API
13 | # GBTClassifier(featuresCol='features', labelCol='label', predictionCol='prediction',
14 | # maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
15 | # cacheNodeIds=False, checkpointInterval=10, lossType='logistic', maxIter=20, stepSize=0.1,
16 | # seed=None, subsamplingRate=1.0, featureSubsetStrategy='all')
17 | 
18 | trainDataRatio = 0.75  # 训练数据比例
19 | maxIter = 20  # 迭代次数
20 | stepSize = 0.1  # 学习速率(0-1)
21 | maxDepth = 5  # 数的最大深度[1,100]
22 | minInstancesPerNode = 1  # 叶子节点最少样本数[1,1000]
23 | seed = 1  # 随机数产生器种子[0,10]
24 | maxBins = 32  # 一个特征分裂的最大数量[1,1000]
25 | 
26 | 
27 | def model_url(projectName):
28 |     urls = getProjectCurrentDataUrl(projectName)
29 |     if urls == 'error':
30 |         return "error: 项目名或项目路径有误"  # 错误类型：项目名或项目路径有误
31 |     return urls['projectAddress'] + '/model/secondClassification'  # 项目地址
32 | 
33 | 
34 | def gbdt(data, label_index, feature_indexs, project_url):
35 | 
36 |     # 2.构造训练数据集
37 |     data_set = data.rdd.map(list)
38 |     (train_data, test_data) = data_set.randomSplit([trainDataRatio, 1 - trainDataRatio])
39 |     data.show()
40 | 
41 |     def func(x):
42 |         features_data = []
43 |         for feature in feature_indexs:
44 |             features_data.append(x[feature])
45 |         return Row(label=label_index, features=Vectors.dense(features_data))
46 | 
47 |     training_set = train_data.map(list).map(lambda x: func(x)).toDF()
48 |     training_set.show()
49 |     train_num = training_set.count()
50 |     print("训练样本数:{}".format(train_num))
51 | 
52 |     # 3.使用GBDT进行训练
53 |     string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
54 |     si_model = string_indexer.fit(training_set)
55 |     tf = si_model.transform(training_set)
56 | 
57 |     gbdt = GBTClassifier(labelCol="indexed",
58 |                          maxIter=maxIter, stepSize=stepSize, maxDepth=maxDepth, minInstancesPerNode=minInstancesPerNode,
59 |                          seed=seed)
60 |     gbdt_model = gbdt.fit(tf)
61 |     print(gbdt_model.featureImportances)
62 |     # 保存模型
63 |     model_path = project_url + '/gbdt'
64 |     gbdt_model.write().overwrite().save(model_path)
65 | 
66 |     # 加载模型
67 |     gbdt_model2 = GBTClassificationModel.load(model_path)
68 | 
69 |     # 预测
70 |     gbdt_model2.transform(training_set).select("prediction", "label", "features").show(5)
71 | 
72 | 
73 | userId = 1
74 | functionName = 'gdbt'
75 | projectName = '订单分析'
76 | label = 0  # 标签列
77 | features = [12, 13, 14, 15]  # 特征列
78 | model_path = model_url(projectName)  # 项目路径
79 | 
80 | # spark会话
81 | ss = getSparkSession(userId, functionName)
82 | # 解析项目路径，读取csv
83 | df = getProjectCurrentData(ss, projectName)
84 | # if df == "error: 项目名或项目路径有误":
85 | #     state = False
86 | #     reason = df
87 | #     return returnDataModel(df, state, reason)
88 | gbdt(df, label, features, model_path)  # 二分类
89 | 
90 | # 错误 'PipelinedRDD' object has no attribute '_jdf'
91 | # 报这个错，是因为导入的机器学习包错误所致。
92 | # pyspark.ml 是用来处理DataFrame。
93 | # pyspark.mllib是用来处理RDD。
94 | 


--------------------------------------------------------------------------------
/app/ml/secondClassification/LR.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, LogisticRegressionSummary
 3 | from pyspark.sql import Row
 4 | from pyspark.ml.linalg import Vectors
 5 | from app.Utils import *
 6 | 
 7 | '''
 8 | class pyspark.ml.classification.LogisticRegression(featuresCol='features', labelCol='label', predictionCol='prediction', 
 9 | maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, fitIntercept=True, threshold=0.5, thresholds=None, 
10 | probabilityCol='probability', rawPredictionCol='rawPrediction', standardization=True, weightCol=None, 
11 | aggregationDepth=2, family='auto', lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, 
12 | lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None)
13 | 
14 | featuresCol # 特征列
15 | labelCol # 标签列
16 | predictionCol # 预测输出列
17 | maxIter # 最大迭代轮数
18 | regParam # 正则化参数
19 | elasticNetParam # ElasticNet混合参数，范围为[0，1]。对于alpha = 0，惩罚是L2惩罚。对于alpha = 1，这是L1惩罚（默认值：0.0）
20 | tol # 迭代算法的收敛容限（> = 0）（默认值：1e-06）
21 | fitIntercept # 是否训练截距项（默认值：True）
22 | threshold # 二进制分类预测中的阈值，范围为[0，1]（默认值：0.5）。
23 | thresholds # 多类别分类中的阈值，用于调整预测每个类别的概率。数组的长度必须等于类的数量，其值必须大于0，但最多一个值可以为0，这是预测值。p/t最大的类是可预测的，其中p是该类的原始概率，t是该类的概率阈值（未定义）
24 | rawPredictionCol：原始预测（也称为置信度）列名称（默认值：rawPrediction）
25 | standardization: whether to standardize the training features before fitting the model (default: True)
26 | weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0 (current: weight)
27 | aggregationDepth # suggested depth for treeAggregate (>= 2) (default: 2)
28 | family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial. (default: auto)
29 | lowerBoundsOnCoefficients：如果在边界约束优化下拟合，则系数的下界。 （未定义）
30 | lowerBoundsOnIntercepts：如果在边界约束优化下拟合，则截距的下限。 （未定义）
31 | upperBoundsOnIntercepts：如果在边界约束优化下拟合，则截距的上限。 （未定义）
32 | 
33 | '''
34 | 
35 | 
36 | def project_url(projectName):
37 |     urls = getProjectCurrentDataUrl(projectName)
38 |     if urls == 'error':
39 |         return "error: 项目名或项目路径有误"  # 错误类型：项目名或项目路径有误
40 |     return urls['projectAddress']  # 项目地址
41 | 
42 | 
43 | def lr(ss, data, label_index, feature_indexs, project_url):
44 |     # 1.构造训练数据集
45 |     def func(x):
46 |         features_data = []
47 |         for feature in feature_indexs:
48 |             features_data.append(x[feature])
49 |         return Row(label=label_index, features=Vectors.dense(features_data))
50 | 
51 |     training_set = data.rdd.map(list).map(lambda x: func(x)).toDF()
52 | 
53 |     # 2.训练模型
54 |     lr_param = LogisticRegression(regParam=0.01)
55 |     lr_model = lr_param.fit(training_set)
56 |     print(lr_model.coefficients)  # 系数
57 |     print(lr_model.intercept)  # 截距
58 |     print(lr_model.explainParams())  # 参数以及其注解
59 | 
60 |     # 3.保存模型
61 |     model_path = project_url + '/model/secondClassification/lr'
62 |     lr_model.write().overwrite().save(model_path)
63 | 
64 |     # 4.读取模型
65 |     lr2 = lr_model.load(model_path)
66 | 
67 |     # 5.预测
68 |     result = lr2.transform(training_set).head()
69 |     print(result.prediction)
70 | 
71 |     sum = lr_model.summary
72 |     print('------roc--', sum.areaUnderROC)
73 | 
74 |     # 6.评估
75 |     # summary = lr_model.evaluate(training_set)
76 | 
77 | 
78 | userId = 1
79 | functionName = 'gdbt'
80 | projectName = '订单分析'
81 | label = 0  # 标签列
82 | features = [12, 13, 14, 15]  # 特征列
83 | project_path = project_url(projectName)  # 项目路径
84 | # spark会话
85 | ss = getSparkSession(userId, functionName)
86 | # 解析项目路径，读取csv
87 | df = getProjectCurrentData(ss, projectName)
88 | # 罗辑回归二分类
89 | lr(ss, df, label, features, project_path)
90 | 


--------------------------------------------------------------------------------
/app/ml/secondClassification/SVM.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from pyspark.mllib.classification import SVMModel
  3 | from pyspark.mllib.classification import SVMWithSGD
  4 | from pyspark.mllib.regression import LabeledPoint
  5 | from pyspark.mllib.evaluation import BinaryClassificationMetrics
  6 | from app.Utils import *
  7 | import numpy as np
  8 | 
  9 | 
 10 | def model_url(projectName):
 11 |     urls = getProjectCurrentDataUrl(projectName)
 12 |     if urls == 'error':
 13 |         return "error: 项目名或项目路径有误"  # 错误类型：项目名或项目路径有误
 14 |     return urls['projectAddress'] + '/model/secondClassification'  # 项目地址
 15 | 
 16 | 
 17 | def svm(ss, data, label_index, feature_indexs, model_url):
 18 |     # 1. 准备数据
 19 |     def func(x):
 20 |         features_data = []
 21 |         for feature in feature_indexs:
 22 |             features_data.append(x[feature])
 23 |         return LabeledPoint(label=np.random.randint(0, 2), features=features_data)
 24 | 
 25 |     training_data = data.rdd.map(lambda x: func(x))
 26 | 
 27 |     # 2. 训练
 28 |     svm_model = SVMWithSGD.train(training_data, iterations=20, step=1.0, regParam=0.01,
 29 |                                  miniBatchFraction=1.0, initialWeights=None, regType="l2",
 30 |                                  intercept=False, validateData=True, convergenceTol=0.001)
 31 | 
 32 |     # 3.预测
 33 |     predict_data = training_data.map(lambda x: x.features)
 34 |     prediction = svm_model.predict(predict_data)
 35 |     print(prediction.take(10))
 36 |     # print("真实值:{},预测值{}".format(prediction, training_data.first().label))
 37 | 
 38 |     # # 4.保存模型
 39 |     # svm_model_path = model_url + '/svm'
 40 |     # deltree(svm_model_path)  # 删除已经存在的模型
 41 |     # svm_model.save(ss.sparkContext, svm_model_path)
 42 |     #
 43 |     # # 5.加载模型
 44 |     # same_model = SVMModel.load(ss.sparkContext, svm_model_path)
 45 | 
 46 |     # 6.模型评估
 47 |     evl(training_data, svm_model)
 48 | 
 49 | 
 50 | def evl(data, svmModel):
 51 |     ## 正确率和错误率
 52 |     # lrTotalCorrect = data.map(lambda r: 1 if (lrModel.predict(r.features) == r.label) else 0).reduce(lambda x, y: x + y)
 53 |     # lrAccuracy = lrTotalCorrect / float(data.count())  # 0.5136044023234485
 54 | 
 55 |     svmTotalCorrect = data.map(lambda r: 1 if (svmModel.predict(r.features) == r.label) else 0).reduce(
 56 |         lambda x, y: x + y)
 57 |     svmAccuracy = svmTotalCorrect / float(data.count())  # 0.5136044023234485
 58 |     #
 59 |     # nbTotalCorrect = data_for_bayes.map(lambda r: 1 if (bayesModel.predict(r.features) == r.label) else 0).reduce(
 60 |     #     lambda x, y: x + y)
 61 |     # nbAccuracy = nbTotalCorrect / float(data_for_bayes.count())  # 0.5799449709568939
 62 | 
 63 |     # dt_predictions = dtModel.predict(data.map(lambda x: x.features))
 64 |     # labelsAndPredictions = data.map(lambda x: x.label).zip(dt_predictions)
 65 |     # dtTotalCorrect = labelsAndPredictions.map(lambda r: 1 if (r[0] == r[1]) else 0).reduce(lambda x, y: x + y)
 66 |     # dtAccuracy = dtTotalCorrect / float(data.count())  # 0.654234179150107
 67 | 
 68 |     # Compute raw scores on the test set
 69 |     # lrPredictionAndLabels = data.map(lambda lp: (float(lrModel.predict(lp.features)), lp.label))
 70 |     # # Instantiate metrics object
 71 |     # lrmetrics = BinaryClassificationMetrics(lrPredictionAndLabels)
 72 |     # # Area under precision-recall curve
 73 |     # print("Area under PR = %s" % lrmetrics.areaUnderPR)
 74 |     # # Area under ROC curve
 75 |     # print("Area under ROC = %s" % lrmetrics.areaUnderROC)
 76 | 
 77 |     # 清除默认阈值，这样会输出原始的预测评分，即带有确信度的结果
 78 |     svmModel.clearThreshold()
 79 |     predict_data = data.map(lambda x: x.features)
 80 |     prediction = svmModel.predict(predict_data)
 81 |     print(prediction.take(10))
 82 | 
 83 |     svmPredictionAndLabels = data.map(lambda lp: (float(svmModel.predict(lp.features)), lp.label))
 84 |     svmMetrics = BinaryClassificationMetrics(svmPredictionAndLabels)
 85 |     print("Area under PR = %s" % svmMetrics.areaUnderPR)
 86 |     print("Area under ROC = %s" % svmMetrics.areaUnderROC)
 87 | 
 88 |     # bayesPredictionAndLabels = data_for_bayes.map(lambda lp: (float(bayesModel.predict(lp.features)), lp.label))
 89 |     # bayesMetrics = BinaryClassificationMetrics(bayesPredictionAndLabels)
 90 |     # print("Area under PR = %s" % bayesMetrics.areaUnderPR)
 91 |     # print("Area under ROC = %s" % bayesMetrics.areaUnderROC)
 92 | 
 93 | 
 94 | userId = 1
 95 | functionName = 'gdbt'
 96 | projectName = '订单分析'
 97 | label = 0  # 标签列
 98 | features = ["数量", "折扣", "利润", "装运成本"]  # 特征列
 99 | model_path = model_url(projectName)  # 项目路径
100 | 
101 | # spark会话
102 | ss = getSparkSession(userId, functionName)
103 | # 解析项目路径，读取csv
104 | df = getProjectCurrentData(ss, projectName)
105 | # svm二分类
106 | svm(ss, df, label, features, model_path)
107 | 
108 | '''
109 | 错误 'PipelinedRDD' object has no attribute 'show'
110 | 报这个错，是因为 df.show() is only for spark DataFrame 所致。
111 | '''
112 | 


--------------------------------------------------------------------------------
/app/ml/secondClassification/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/ml/secondClassification/__init__.py


--------------------------------------------------------------------------------
/app/models/MSEntity.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from app import db
  3 | import click
  4 | 
  5 | 
  6 | # @app.cli.command()
  7 | def initdb():
  8 |     """
  9 |     创建数据库
 10 |     :return:
 11 |     """
 12 |     db.create_all()
 13 |     click.echo('Initialized database.')
 14 | 
 15 | 
 16 | # @app.cli.command()
 17 | def dropdb():
 18 |     """
 19 |     删除数据库
 20 |     :return:
 21 |     """
 22 |     db.drop_all()
 23 |     click.echo('Drop database.')
 24 | 
 25 | 
 26 | class DataSource(db.Model):
 27 |     """
 28 |     数据源类
 29 |     """
 30 |     __tablename__ = 'data_source'
 31 |     id = db.Column(db.Integer, primary_key=True)
 32 |     file_name = db.Column(db.String(64), unique=True, index=True)
 33 |     file_url = db.Column(db.Text, unique=True)
 34 |     file_type = db.Column(db.String(64))
 35 |     create_user = db.Column(db.String(64))
 36 |     open_level = db.Column(db.String(64))
 37 |     create_time = db.Column(db.String(64))
 38 | 
 39 | 
 40 | class User(db.Model):
 41 |     """
 42 |     用户类
 43 |     """
 44 |     __tablename__ = 'user'
 45 |     id = db.Column(db.Integer, primary_key=True)
 46 |     user_name = db.Column(db.String(64), unique=True, index=True)
 47 |     password = db.Column(db.String(128))
 48 | 
 49 | 
 50 | class Project(db.Model):
 51 |     """
 52 |     项目类
 53 |     """
 54 |     __tablename__ = 'project'
 55 |     id = db.Column(db.Integer, primary_key=True)
 56 |     project_name = db.Column(db.String(64), unique=True, index=True)
 57 |     project_address = db.Column(db.String(256), unique=True, index=True)
 58 |     user_id = db.Column(db.Integer, db.ForeignKey('user.id'))
 59 |     dataSource_id = db.Column(db.Integer, db.ForeignKey('data_source.id'))
 60 | 
 61 | 
 62 | class ProcessFlow(db.Model):
 63 |     """
 64 |     数据处理流程类
 65 |     """
 66 |     __tablename__ = 'process_flow'
 67 |     id = db.Column(db.Integer, primary_key=True)
 68 |     project_id = db.Column(db.String(64), unique=True, index=True)
 69 |     operates = db.Column(db.String(13000))
 70 |     cur_ope_id = db.Column(db.String(128))
 71 |     links = db.Column(db.String(5000))
 72 | 
 73 | 
 74 | class Model(db.Model):
 75 |     """
 76 |     DAG图模型类
 77 |     """
 78 |     __tablename__ = 'model'
 79 |     id = db.Column(db.Integer, primary_key=True)
 80 |     model_name = db.Column(db.String(64))
 81 |     project_id = db.Column(db.Integer)
 82 |     start_nodes = db.Column(db.String(2048))
 83 |     config = db.Column(db.String(8192))
 84 |     create_time = db.Column(db.String(32))
 85 | 
 86 | 
 87 | class OperatorType(db.Model):
 88 |     """
 89 |     算子的类型（如：读数据，filter）
 90 |     """
 91 |     __tablename__ = 'operator_type'
 92 |     id = db.Column(db.Integer, primary_key=True)
 93 |     type_name = db.Column(db.String(128))
 94 |     type_label = db.Column(db.String(128))
 95 | 
 96 | 
 97 | class Operator(db.Model):
 98 |     """
 99 |     算子类
100 |     """
101 |     __tablename__ = 'operator'
102 |     id = db.Column(db.String(128), primary_key=True)
103 |     operator_name = db.Column(db.String(64))
104 |     father_operator_ids = db.Column(db.String(128))
105 |     child_operator_ids = db.Column(db.String(128))
106 |     model_id = db.Column(db.Integer)
107 |     status = db.Column(db.String(32))
108 |     operator_output_url = db.Column(db.String(512))
109 |     operator_input_url = db.Column(db.String(512))
110 |     operator_type_id = db.Column(db.Integer)
111 |     operator_config = db.Column(db.String(4096))
112 |     operator_style = db.Column(db.String(4096))
113 |     run_info = db.Column(db.String(8192))
114 | 
115 | 
116 | class ModelExecute(db.Model):
117 |     """
118 |     模型执行记录表
119 |     """
120 |     __tablename__ = 'model_execute'
121 |     id = db.Column(db.Integer, primary_key=True)
122 |     model_id = db.Column(db.Integer)
123 |     start_nodes = db.Column(db.String(2048))
124 |     status = db.Column(db.String(32))
125 |     execute_user_id = db.Column(db.Integer)
126 |     run_info = db.Column(db.String(4096))
127 |     create_time = db.Column(db.String(32))
128 |     end_time = db.Column(db.String(32))
129 | 
130 | 
131 | class Report(db.Model):
132 |     """
133 |     报告表
134 |     """
135 |     __tablename__ = 'report'
136 |     id = db.Column(db.Integer, primary_key=True)
137 |     user_id = db.Column(db.String(128))
138 |     report_title = db.Column(db.String(128))
139 |     report_content = db.Column(db.String(20000))
140 | 
141 | 
142 | class MLModel(db.Model):
143 |     """
144 |     机器学习算法训练结果模型表
145 |     """
146 |     __tablename__ = 'ml_model'
147 |     id = db.Column(db.Integer, primary_key=True)
148 |     user_id = db.Column(db.Integer)
149 |     project_id = db.Column(db.Integer)
150 |     model_id = db.Column(db.Integer)
151 |     status = db.Column(db.String(32))
152 |     name = db.Column(db.String(32))
153 |     operator_type_id = db.Column(db.Integer)
154 |     model_url = db.Column(db.String(256))
155 | 


--------------------------------------------------------------------------------
/app/models/ServerNameMap.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | 
 3 | """
 4 | v1版本，弃用
 5 | """
 6 | 
 7 | 
 8 | class ServerNameMap():
 9 |     operateIdToNameMap = {
10 |         # 1000 数据预处理
11 |         "1001": "过滤",
12 |         "1002": "排序",
13 |         "1003": "列拆分",
14 |         "1004": "行拆分",
15 |         "1005": "列合并",
16 |         "1006": "替换",
17 |         "1007": "填充控制",
18 |         "1008": "列变换",
19 |         # 2000 特征工程
20 |         "2001": "分位数离散化",
21 |         "2002": "向量索引转换",
22 |         "2003": "标准化",
23 |         "2004": "PCA",
24 |         "2005": "字符串转标签",
25 |         "2006": "独热编码",
26 |         "2007": "多项式扩展",
27 |         "2008": "卡方选择"
28 |     }
29 |     operateIdToTypeMap = {
30 |         # 1000 数据预处理
31 |         "1001": "数据预处理",
32 |         "1002": "数据预处理",
33 |         "1003": "数据预处理",
34 |         "1004": "数据预处理",
35 |         "1005": "数据预处理",
36 |         "1006": "数据预处理",
37 |         "1007": "数据预处理",
38 |         "1008": "数据预处理",
39 |         # 2000 特征工程
40 |         "2001": "特征工程",
41 |         "2002": "特征工程",
42 |         "2003": "特征工程",
43 |         "2004": "特征工程",
44 |         "2005": "特征工程",
45 |         "2006": "特征工程",
46 |         "2007": "特征工程",
47 |         "2008": "特征工程"
48 |     }
49 | 
50 |     typeToColorMap = {
51 |         "数据预处理": '#5fe0a6',
52 |         "特征工程": '#f0bb66'
53 |     }
54 | 


--------------------------------------------------------------------------------
/app/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/models/__init__.py


--------------------------------------------------------------------------------
/app/service/ClearTask.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | from app.models.MSEntity import Operator, MLModel
 3 | from app import db
 4 | from app.Utils import deltree, deldir
 5 | 
 6 | """
 7 | 该类的作用：清理无用的中间数据
 8 | """
 9 | 
10 | import os
11 | 
12 | filePath = '/home/zk/midData'
13 | modelPaths = ['/home/zk/midData/model/secondClassification/svm', '/home/zk/midData/model/secondClassification/gbdt']
14 | 
15 | # 数据可用到的中间数据
16 | urls_arr = []
17 | # 查找operator
18 | query = db.session.query(Operator)
19 | db.session.commit()
20 | for operator in query:
21 |     url = operator.operator_output_url
22 |     if (url is not None) and (url is not ''):
23 |         urls_arr.extend(url.split('*,'))
24 | 
25 | # 查找 保存的model
26 | models = db.session.query(MLModel)
27 | db.session.commit()
28 | for model in models:
29 |     url = model.model_url
30 |     if (url is not None) and (url is not ''):
31 |         urls_arr.extend(url.split('*,'))
32 | print(urls_arr)
33 | 
34 | # 磁盘上所有中间数据
35 | all_file = []
36 | for i, j, k in os.walk(filePath):
37 |     for item in k:
38 |         all_file.append(filePath + '/' + item)
39 |     break
40 | 
41 | for modelPath in modelPaths:
42 |     for i, j, k in os.walk(modelPath):
43 |         for item in j:
44 |             all_file.append(modelPath + '/' + item)
45 |         break
46 | 
47 | for url in all_file:
48 |     print(url)
49 | 
50 | print('******删除一下内容：')
51 | for url in all_file:
52 |     if url not in urls_arr:
53 |         print("删除：" + url)
54 |         if os.path.isdir(url):
55 |             deltree(url)
56 |         elif os.path.isfile(url):
57 |             deldir(url)
58 | 


--------------------------------------------------------------------------------
/app/service/ExplorationService.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from app.Utils import *
  3 | import app.dao.OperatorDao as OperatorDao
  4 | import pandas as pd
  5 | 
  6 | 
  7 | def full_table_statistics(spark_session, operator_id, file_url, condition):
  8 |     """
  9 |     全表统计
 10 |     :param spark_session:
 11 |     :param operator_id:
 12 |     :param file_url:
 13 |     :param condition:
 14 |     :return:
 15 |     """
 16 | 
 17 |     try:
 18 |         # 修改计算状态
 19 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
 20 |         # 读取数据
 21 |         df = read_data_pandas(file_url)
 22 |         # 全表统计函数
 23 |         result_df = full_table_statistics_core(df, condition)
 24 |         if isinstance(result_df, str):
 25 |             OperatorDao.update_operator_by_id(operator_id, 'error', '', result_df)
 26 |         else:
 27 |             # 存储结果
 28 |             result_file_url = save_data_pandas(result_df, '', '', 1)
 29 |             run_info = '全表统计算子执行成功'
 30 |             # 修改计算状态
 31 |             OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
 32 |             return [result_file_url]
 33 | 
 34 |     except Exception as e:
 35 |         run_info = str(e)
 36 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
 37 |         traceback.print_exc()
 38 |     return []
 39 | 
 40 | 
 41 | def full_table_statistics_core(df, condition):
 42 |     """
 43 |     全表统计核心函数
 44 |     :param df: 数据（pandas）
 45 |     :param condition: {"projectId": 32, "columnNames": ['利润']}
 46 |     :return:
 47 |     """
 48 |     column_names = condition['columnNames']
 49 |     statistics = ['类型', '总数', '最小值', '最小值位置', '25%分位数', '中位数', '75%分位数', '均值', '最大值', '最大值位置', '平均绝对偏差', '方差',
 50 |                   '标准差', '偏度', '峰度']
 51 |     data = {}
 52 |     for columnName in column_names:
 53 |         info = []
 54 |         column_type = df[columnName].dtype
 55 |         if column_type == 'int64' or column_type == 'float64':
 56 |             info.append('number')
 57 |             info.append(str(df[columnName].count()))
 58 |             info.append(str(df[columnName].min()))
 59 |             info.append(str(df[columnName].idxmin()))
 60 |             info.append(str(df[columnName].quantile(.25)))
 61 |             info.append(str(df[columnName].median()))
 62 |             info.append(str(df[columnName].quantile(.75)))
 63 |             info.append(str(df[columnName].mean()))
 64 |             info.append(str(df[columnName].max()))
 65 |             info.append(str(df[columnName].idxmax()))
 66 |             info.append(str(df[columnName].mad()))
 67 |             info.append(str(df[columnName].var()))
 68 |             info.append(str(df[columnName].std()))
 69 |             info.append(str(df[columnName].skew()))
 70 |             info.append(str(df[columnName].kurt()))
 71 |         else:
 72 |             info.append('text')
 73 |             info.append(str(df[columnName].count()))
 74 |             info.append('')
 75 |             info.append('')
 76 |             info.append('')
 77 |             info.append('')
 78 |             info.append('')
 79 |             info.append('')
 80 |             info.append('')
 81 |             info.append('')
 82 |             info.append('')
 83 |             info.append('')
 84 |             info.append('')
 85 |             info.append('')
 86 |             info.append('')
 87 |         data[columnName] = info
 88 |         print(pd.DataFrame(data, index=statistics))
 89 |     return pd.DataFrame(data, index=statistics)
 90 | 
 91 | 
 92 | def frequency_statistics(spark_session, operator_id, file_url, condition):
 93 |     """
 94 |     频次统计
 95 |     :param spark_session:
 96 |     :param operator_id:
 97 |     :param file_url:
 98 |     :param condition:
 99 |     :return:
100 |     """
101 |     try:
102 |         # 修改计算状态
103 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
104 |         # 读取数据
105 |         df = read_data_pandas(file_url)
106 |         # 频次统计函数
107 |         result_df = frequency_statistics_core(df, condition)
108 |         if isinstance(result_df, str):
109 |             OperatorDao.update_operator_by_id(operator_id, 'error', '', result_df)
110 |         else:
111 |             # 存储结果
112 |             result_file_url = save_data_pandas(result_df)
113 |             run_info = '频次统计算子执行成功'
114 |             # 修改计算状态
115 |             OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
116 |             return [result_file_url]
117 | 
118 |     except Exception as e:
119 |         run_info = str(e)
120 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
121 |         traceback.print_exc()
122 |     return []
123 | 
124 | 
125 | def frequency_statistics_core(df, condition):
126 |     """
127 |     :param df:
128 |     :param condition:{"projectId":32,"columnName":"Item"}
129 |     :return:
130 |     """
131 |     column_name = condition['columnName']
132 | 
133 |     s = df[column_name].value_counts()
134 |     data = {column_name: s.index, "频率": s.values}
135 |     print(pd.DataFrame(data))
136 |     return pd.DataFrame(data)
137 | 
138 | 
139 | def correlation_coefficient(spark_session, operator_id, file_url, condition):
140 |     """
141 |     相关系数
142 |     :param spark_session:
143 |     :param operator_id:
144 |     :param file_url:
145 |     :param condition:
146 |     :return:
147 |     """
148 |     try:
149 |         # 修改计算状态
150 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
151 |         # 读取数据
152 |         df = read_data_pandas(file_url)
153 |         # 相关系数函数
154 |         result_df = correlation_coefficient_core(df, condition)
155 |         if isinstance(result_df, str):
156 |             OperatorDao.update_operator_by_id(operator_id, 'error', '', result_df)
157 |         else:
158 |             # 存储结果
159 |             result_file_url = save_data_pandas(result_df, '', '', 1)
160 |             run_info = '相关系数算子执行成功'
161 |             # 修改计算状态
162 |             OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
163 |             return [result_file_url]
164 | 
165 |     except Exception as e:
166 |         run_info = str(e)
167 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
168 |         traceback.print_exc()
169 |     return []
170 | 
171 | 
172 | def correlation_coefficient_core(df, condition):
173 |     """
174 |     :param df:
175 |     :param condition: {"projectId": 32, "columnNames": ["销售额", "折扣", "装运成本"]}
176 |     :return:
177 |     """
178 |     column_names = condition['columnNames']
179 |     # 报错信息：如果所选列不是数值型，则报错
180 |     accept_types = ['int64', 'float64']
181 |     for columnName in column_names:
182 |         if df[columnName].dtype not in accept_types:
183 |             return "只能计算数值型列的相关系数，但是 <" + columnName + "> 的类型为 " + str(df[columnName].dtype)
184 |     # 计算出相关系数矩阵df
185 |     return df.corr()
186 | 


--------------------------------------------------------------------------------
/app/service/MLModelService.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | 
 3 | import app.dao.ModelDao as ModelDao
 4 | import app.dao.MLModelDao as MLModelDao
 5 | import app.dao.OperatorDao as OperatorDao
 6 | import app.dao.OperatorTypeDao as OperatorTypeDao
 7 | import app.dao.ModelExecuteDao as ModelExecuteDao
 8 | from app.models.MSEntity import Operator, ModelExecute, MLModel
 9 | import app.service.ModelExecuteService as ModelExecuteService
10 | from app.Utils import *
11 | 
12 | """
13 | 关于ml_model 的处理方法
14 | """
15 | 
16 | 
17 | def save_ml_model(operator_id, user_id, name):
18 |     """
19 |     保存训练模型
20 |     :param operator_id:
21 |     :param user_id:
22 |     :param name:
23 |     :return:
24 |     """
25 |     # 查看算子
26 |     operator = OperatorDao.get_operator_by_id(operator_id)
27 |     if operator.operator_type_id > 7000 or operator.operator_type_id < 6001:
28 |         return "所选择的节点并不是模型算子节点"
29 |     if operator.status != "success":
30 |         return "请执行该节点"
31 |     if operator.operator_output_url is not None:
32 |         operator_output_url = operator.operator_output_url.split('*,')
33 |     else:
34 |         return "没有运行结果"
35 | 
36 |     model_url = operator_output_url[0]
37 |     operator_type_id = operator.operator_type_id
38 |     model_id = operator.model_id
39 | 
40 |     # 查看执行流程model
41 |     model = ModelDao.get_model_by_id(model_id)
42 |     project_id = model.project_id
43 | 
44 |     ml_model = MLModel(user_id=user_id, project_id=project_id, model_id=model_id, status='save', name=name,
45 |                        operator_type_id=operator_type_id, model_url=model_url)
46 |     return MLModelDao.create_ml_model(ml_model)
47 | 
48 | 
49 | def get_ml_model(ml_model_id, project_id, user_id, model_id, name, status):
50 |     """
51 |     按照条件查询ml_model
52 |     :param ml_model_id:
53 |     :param project_id:
54 |     :param user_id:
55 |     :param model_id:
56 |     :param name:
57 |     :param status:
58 |     :return:
59 |     """
60 |     ml_models = MLModel.query
61 |     if (ml_model_id is not None) and (ml_model_id is not ''):
62 |         ml_models = ml_models.filter(MLModel.id == ml_model_id)
63 |     if (project_id is not None) and (project_id is not ''):
64 |         ml_models = ml_models.filter(MLModel.project_id == project_id)
65 |     if (user_id is not None) and (user_id is not ''):
66 |         ml_models = ml_models.filter(MLModel.user_id == user_id)
67 |     if (model_id is not None) and (model_id is not ''):
68 |         ml_models = ml_models.filter(MLModel.model_id == model_id)
69 |     if (name is not None) and (name is not ''):
70 |         ml_models = ml_models.filter(MLModel.name == name)
71 |     if (status is not None) and (status is not ''):
72 |         ml_models = ml_models.filter(MLModel.status == status)
73 |     results = []
74 |     for ml_model in ml_models:
75 |         results.append({"MLModelId": ml_model.id, "status": ml_model.status, "name": ml_model.name,
76 |                         "operatorTypeId": ml_model.operator_type_id})
77 |     return results
78 | 
79 | 
80 | def delete_ml_model(ml_model_id):
81 |     """
82 |     删除 ml_model
83 |     :param ml_model_id:
84 |     :return:
85 |     """
86 |     filters = {MLModel.id: ml_model_id}
87 |     MLModel.query.filter(*filters).delete()
88 |     db.session.commit()
89 | 


--------------------------------------------------------------------------------
/app/service/ModelExecuteService.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import time
  3 | import queue
  4 | import threading
  5 | import traceback
  6 | import app.service.FEService as FEService
  7 | import app.service.ml.Evaluation as Evaluation
  8 | import app.service.ml.ModelService as ModelService
  9 | import app.service.PreprocessService as preprocessService
 10 | import app.service.ExplorationService as ExplorationService
 11 | import app.service.ml.PredictService as PredictService
 12 | import app.service.ml.SecondClassification as SecondClassification
 13 | import app.service.ml.MultipleClassifition as MultipleClassifition
 14 | import app.dao.OperatorDao as OperatorDao
 15 | 
 16 | 
 17 | def model_thread_execute(spark_session, start_nodes):
 18 |     """
 19 |     多线程执行 model（执行流程）
 20 |     :param spark_session：
 21 |     :param start_nodes:['1','2'] model（执行流程启动的节点）
 22 |     :return:
 23 |     """
 24 | 
 25 |     class MyThread(threading.Thread):
 26 |         def __init__(self, threadID, name, q):
 27 |             threading.Thread.__init__(self)
 28 |             self.threadID = threadID
 29 |             self.name = name
 30 |             self.q = q
 31 | 
 32 |         def run(self):
 33 |             print("开启线程：" + self.name)
 34 |             process_data(self.name, self.q)
 35 |             print("退出线程：" + self.name)
 36 | 
 37 |     def process_data(threadName, q):
 38 |         print('-------进入线程：', threadName)
 39 | 
 40 |         while G.noExecFlag or G.execCounter or not workQueue.empty():
 41 |             print('-------进入线程内部循环：', not workQueue.empty())
 42 |             # TODO 多线程安全
 43 |             G.noExecFlag = 0
 44 |             G.execCounter += 1
 45 |             queueLock.acquire()
 46 |             if not workQueue.empty():
 47 |                 operator_id = q.get()
 48 |                 queueLock.release()
 49 |                 # TODO:处理函数
 50 |                 could_execute_operator_ids = operator_execute(spark_session, operator_id)
 51 |                 if could_execute_operator_ids is False:
 52 |                     print("%s processing %s error" % (threadName, operator_id))
 53 |                 else:
 54 |                     for item_id in could_execute_operator_ids:
 55 |                         if item_id != '' and item_id is not None:
 56 |                             q.put(item_id)
 57 |                     print("%s processing %s add %s to queue" % (
 58 |                         threadName, operator_id, ','.join(could_execute_operator_ids)))
 59 |                     print("q.size: %s --- workQueue.size: %s" % (q.qsize(), workQueue.qsize()))
 60 |             else:
 61 |                 queueLock.release()
 62 |             G.execCounter -= 1
 63 |             time.sleep(1)
 64 | 
 65 |     class G:
 66 |         # 未开始执行
 67 |         noExecFlag = 1
 68 |         # 正在执行operator的个数
 69 |         execCounter = 0
 70 | 
 71 |     threadList = ["Thread-1", "Thread-2", "Thread-3"]
 72 |     queueLock = threading.Lock()
 73 |     workQueue = queue.Queue(10)
 74 |     threads = []
 75 |     threadID = 1
 76 | 
 77 |     # 填充队列
 78 |     queueLock.acquire()
 79 |     for word in start_nodes:
 80 |         workQueue.put(word)
 81 |     queueLock.release()
 82 | 
 83 |     # 创建新线程
 84 |     for tName in threadList:
 85 |         thread = MyThread(threadID, tName, workQueue)
 86 |         thread.start()
 87 |         threads.append(thread)
 88 |         threadID += 1
 89 | 
 90 |     # 等待队列清空
 91 |     while (not workQueue.empty()) or G.execCounter:
 92 |         pass
 93 | 
 94 |     # 等待所有线程完成
 95 |     for t in threads:
 96 |         t.join()
 97 |     print("退出主线程")
 98 | 
 99 | 
100 | def operator_execute(spark_session, operator_id):
101 |     """
102 |     执行算子
103 |     :param spark_session:
104 |     :param operator_id:
105 |     :return:
106 |     """
107 |     try:
108 |         # 查算子
109 |         operator = OperatorDao.get_operator_by_id(operator_id)
110 |         print("------执行算子------", "operator_id：", operator_id, operator.operator_type_id)
111 |         # 获取input_url
112 |         config = json.loads(operator.operator_config)
113 |         file_url_list = config['fileUrl']
114 |         # 获取输入地址
115 |         url_arr = []
116 |         for file_url_dict in file_url_list:
117 |             key = ''
118 |             for ikey in file_url_dict.keys():
119 |                 key = ikey
120 |             if operator_id == key:
121 |                 url_arr.append(file_url_dict[key])
122 |             else:
123 |                 father = OperatorDao.get_operator_by_id(key)
124 |                 # 检查父节点是否准备就绪
125 |                 if father.status != 'success':
126 |                     return []
127 |                 # TODO:暂定从0 开始
128 |                 father_output_url_index = file_url_dict[key]
129 |                 father_url_arr = father.operator_output_url.split('*,')
130 |                 url_arr.append(father_url_arr[father_output_url_index])
131 |         # 算子函数
132 |         if operator.operator_type_id == 1001:
133 |             preprocessService.filter_multi_conditions(spark_session, operator_id, url_arr[0],
134 |                                                       json.loads(operator.operator_config)['parameter'])
135 |         elif operator.operator_type_id == 1002:
136 |             preprocessService.sort(spark_session, operator_id, url_arr[0],
137 |                                    json.loads(operator.operator_config)['parameter'])
138 |         elif operator.operator_type_id == 1003:
139 |             preprocessService.column_split(spark_session, operator_id, url_arr[0],
140 |                                            json.loads(operator.operator_config)['parameter'])
141 |         elif operator.operator_type_id == 1005:
142 |             preprocessService.columns_merge(spark_session, operator_id, url_arr[0],
143 |                                             json.loads(operator.operator_config)['parameter'])
144 |         elif operator.operator_type_id == 1006:
145 |             preprocessService.replace(spark_session, operator_id, url_arr[0],
146 |                                       json.loads(operator.operator_config)['parameter'])
147 |         elif operator.operator_type_id == 1007:
148 |             preprocessService.fill_null_value(spark_session, operator_id, url_arr[0],
149 |                                               json.loads(operator.operator_config)['parameter'])
150 |         elif operator.operator_type_id == 1008:
151 |             preprocessService.column_map(spark_session, operator_id, url_arr[0],
152 |                                          json.loads(operator.operator_config)['parameter'])
153 |         elif operator.operator_type_id == 1009:
154 |             preprocessService.random_split(spark_session, operator_id, url_arr[0],
155 |                                            json.loads(operator.operator_config)['parameter'])
156 |         elif operator.operator_type_id == 2001:
157 |             FEService.quantile_discretization(spark_session, operator_id, url_arr[0],
158 |                                               json.loads(operator.operator_config)['parameter'])
159 |         elif operator.operator_type_id == 2002:
160 |             FEService.vector_indexer(spark_session, operator_id, url_arr[0],
161 |                                      json.loads(operator.operator_config)['parameter'])
162 |         elif operator.operator_type_id == 2003:
163 |             FEService.standard_scaler(spark_session, operator_id, url_arr[0],
164 |                                       json.loads(operator.operator_config)['parameter'])
165 |         elif operator.operator_type_id == 2004:
166 |             FEService.pca(spark_session, operator_id, url_arr[0],
167 |                           json.loads(operator.operator_config)['parameter'])
168 |         elif operator.operator_type_id == 2005:
169 |             FEService.string_indexer(spark_session, operator_id, url_arr[0],
170 |                                      json.loads(operator.operator_config)['parameter'])
171 |         elif operator.operator_type_id == 2006:
172 |             FEService.one_hot_encoder(spark_session, operator_id, url_arr[0],
173 |                                       json.loads(operator.operator_config)['parameter'])
174 |         elif operator.operator_type_id == 2007:
175 |             FEService.polynomial_expansion(spark_session, operator_id, url_arr[0],
176 |                                            json.loads(operator.operator_config)['parameter'])
177 |         elif operator.operator_type_id == 2008:
178 |             FEService.chiSqSelector(spark_session, operator_id, url_arr[0],
179 |                                     json.loads(operator.operator_config)['parameter'])
180 |         elif operator.operator_type_id == 3001:
181 |             ExplorationService.full_table_statistics(spark_session, operator_id, url_arr[0],
182 |                                                      json.loads(operator.operator_config)['parameter'])
183 |         elif operator.operator_type_id == 3002:
184 |             ExplorationService.frequency_statistics(spark_session, operator_id, url_arr[0],
185 |                                                     json.loads(operator.operator_config)['parameter'])
186 |         elif operator.operator_type_id == 3003:
187 |             ExplorationService.correlation_coefficient(spark_session, operator_id, url_arr[0],
188 |                                                        json.loads(operator.operator_config)['parameter'])
189 |         elif operator.operator_type_id == 5001:
190 |             preprocessService.read_data_with_update_record(spark_session, operator_id, url_arr[0])
191 |         elif operator.operator_type_id == 6000:
192 |             PredictService.ml_predict(spark_session, operator_id, url_arr,
193 |                                       json.loads(operator.operator_config)['parameter'])
194 |         elif operator.operator_type_id == 6001:
195 |             SecondClassification.svm(spark_session, operator_id, url_arr[0],
196 |                                      json.loads(operator.operator_config)['parameter'])
197 |         elif operator.operator_type_id == 6002:
198 |             SecondClassification.gbdt(spark_session, operator_id, url_arr[0],
199 |                                       json.loads(operator.operator_config)['parameter'])
200 |         elif operator.operator_type_id == 6003:
201 |             SecondClassification.lr(spark_session, operator_id, url_arr[0],
202 |                                     json.loads(operator.operator_config)['parameter'])
203 |         elif operator.operator_type_id == 6004:
204 |             MultipleClassifition.lr(spark_session, operator_id, url_arr[0],
205 |                                     json.loads(operator.operator_config)['parameter'])
206 |         elif operator.operator_type_id == 6005:
207 |             MultipleClassifition.mpc(spark_session, operator_id, url_arr[0],
208 |                                      json.loads(operator.operator_config)['parameter'])
209 |         elif operator.operator_type_id == 7001:
210 |             Evaluation.second_evaluation(spark_session, operator_id,
211 |                                          json.loads(operator.operator_config)['parameter'])
212 |         elif operator.operator_type_id == 8000:
213 |             ModelService.model_operator(operator_id, json.loads(operator.operator_config)['parameter'])
214 | 
215 |         return operator.child_operator_ids.split(',')
216 | 
217 |     except:
218 |         traceback.print_exc()
219 |         return False
220 | 


--------------------------------------------------------------------------------
/app/service/ModelService.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | 
  3 | import app.dao.ModelDao as ModelDao
  4 | import app.dao.OperatorDao as OperatorDao
  5 | import app.dao.OperatorTypeDao as OperatorTypeDao
  6 | import app.dao.ModelExecuteDao as ModelExecuteDao
  7 | from app.models.MSEntity import Operator, ModelExecute
  8 | import app.service.ModelExecuteService as ModelExecuteService
  9 | from app.Utils import *
 10 | 
 11 | """
 12 | 关于model（执行流程）的处理方法
 13 | """
 14 | 
 15 | 
 16 | def update_model(project_id, start_nodes, config, relationship, config_order):
 17 |     """
 18 |     更新 model (处理流程图)
 19 |     ToDo: 没有考虑数据库操作的原子性
 20 |     :param project_id:
 21 |     :param start_nodes:
 22 |     :param config:
 23 |     :return:
 24 |     """
 25 |     try:
 26 |         # 获取model
 27 |         model = ModelDao.get_model_by_project_id(project_id)
 28 |         if model is False:
 29 |             return False
 30 | 
 31 |         # 更新model
 32 |         update_result = ModelDao.update_with_project_id(project_id, start_nodes, relationship, config_order)
 33 |         if update_result is False:
 34 |             return False
 35 | 
 36 |         # 获取 operator
 37 |         operator_old = OperatorDao.get_operator_by_model_id(model.id)
 38 | 
 39 |         # 新的operator
 40 |         operators = []
 41 |         config_dict = json.loads(config)
 42 |         for operator_id in config_dict.keys():
 43 |             operator_dict = config_dict.get(operator_id)
 44 |             operator_style = json.dumps({'location': operator_dict['location'], }, ensure_ascii=False)
 45 |             # json.dumps(operates, ensure_ascii=False)
 46 |             ope = Operator(id=operator_id,
 47 |                            father_operator_ids=','.join(operator_dict['pre']),
 48 |                            child_operator_ids=','.join(operator_dict['next']),
 49 |                            model_id=model.id,
 50 |                            status='initial',
 51 |                            operator_type_id=operator_dict['name'],
 52 |                            operator_config=json.dumps(operator_dict['config'], ensure_ascii=False),
 53 |                            operator_style=operator_style)
 54 |             operators.append(ope)
 55 | 
 56 |         # 准备删除的算子
 57 |         operator_delete = []
 58 |         # 准备更新的算子
 59 |         operator_update = []
 60 |         for old in operator_old:
 61 |             flag_exist = False
 62 |             for new in operators:
 63 |                 if old.id == new.id:
 64 |                     flag_exist = True
 65 |                     operator_update.append([old, new])
 66 |                     break
 67 |             if not flag_exist:
 68 |                 operator_delete.append(old)
 69 |         # 删除算子
 70 |         for delete in operator_delete:
 71 |             OperatorDao.delete_operator_by_id(delete.id)
 72 |             print("********删除算子", delete)
 73 | 
 74 |         # 更新算子
 75 |         for update in operator_update:
 76 |             update[0].father_operator_ids = update[1].father_operator_ids
 77 |             update[0].child_operator_ids = update[1].child_operator_ids
 78 |             update[0].model_id = update[1].model_id
 79 |             update[0].operator_type_id = update[1].operator_type_id
 80 |             update[0].operator_config = update[1].operator_config
 81 |             update[0].operator_style = update[1].operator_style
 82 |             # 更新算子
 83 |             OperatorDao.create_operator([update[0]])
 84 |             print("*********更新算子", update[0])
 85 | 
 86 |         # 准备添加的算子
 87 |         operator_add = []
 88 |         for new in operators:
 89 |             flag_exist = False
 90 |             for old in operator_old:
 91 |                 if old.id == new.id:
 92 |                     flag_exist = True
 93 |                     break
 94 |             if not flag_exist:
 95 |                 operator_add.append(new)
 96 |         # 添加算子
 97 |         OperatorDao.create_operator(operator_add)
 98 |         print("*********添加算子", operator_add)
 99 |         return True
100 |     except:
101 |         traceback.print_exc()
102 |         return False
103 | 
104 | 
105 | def get_model_by_project_id(project_id):
106 |     """
107 |     获取项目对应的model(执行流程)
108 |     :param project_id:
109 |     :return:
110 |     """
111 |     # 获取 model
112 |     model = ModelDao.get_model_by_project_id(project_id)
113 |     if model is False:
114 |         return False
115 | 
116 |     # 获取 operator
117 |     operators = OperatorDao.get_operator_by_model_id(model.id)
118 |     if operators is False:
119 |         return False
120 | 
121 |     # 获取 operator_type
122 |     operator_types = OperatorTypeDao.get_all_operator_type()
123 |     if operator_types is False:
124 |         return False
125 | 
126 |     # TODO : 查询数据源表
127 | 
128 |     operator_types_dict = dict()
129 |     for operator_type in operator_types:
130 |         operator_types_dict[operator_type.id] = operator_type
131 | 
132 |     # 返回结果
133 |     config = dict()
134 |     for operator in operators:
135 |         if operator_types_dict[operator.operator_type_id].id == 5001:
136 |             data_operator_type = json.loads(operator.operator_config)['fileUrl'][0][operator.id].split('/')[-1]
137 |         else:
138 |             data_operator_type = operator_types_dict[operator.operator_type_id].type_name
139 |         config[operator.id] = {'type': data_operator_type,
140 |                                'name': operator_types_dict[operator.operator_type_id].id,
141 |                                'location': json.loads(operator.operator_style)['location'],
142 |                                'config': json.loads(operator.operator_config),
143 |                                'next': operator.child_operator_ids.split(','),
144 |                                "pre": operator.father_operator_ids.split(',')}
145 |     model_config = json.loads(model.config)
146 |     relationship = []
147 |     for item in model_config['relationship'].split('*,'):
148 |         relationship.append(list_str_to_list(item))
149 |     config_order = json.loads(model_config['config_order'])
150 |     return {'projectId': project_id, 'config': config, 'startNode': model.start_nodes.split(','),
151 |             'relationship': relationship, 'config_order': config_order}
152 | 
153 | 
154 | def get_status_model_execute_end(project_id, start_operator_ids):
155 |     """
156 |     获取运行结束后的状态
157 | 
158 |     :param project_id:
159 |     :param start_operator_ids:
160 |     :return:
161 |     """
162 |     # 获取 model
163 |     model = ModelDao.get_model_by_project_id(project_id)
164 |     if model is False:
165 |         return False
166 | 
167 |     # 获取 operator
168 |     operators = OperatorDao.get_operator_by_model_id(model.id)
169 |     if operators is False:
170 |         return False
171 | 
172 |     # 构造dict
173 |     id_operator_dict = {}
174 |     for operator in operators:
175 |         id_operator_dict[operator.id] = operator
176 | 
177 |     operator_from_one_ids = []
178 |     operator_from_one_ids.extend(start_operator_ids)
179 | 
180 |     # 从此次执行 起始节点及以后节点的状态
181 |     status_set = set()
182 |     while operator_from_one_ids:
183 |         item = operator_from_one_ids.pop(0)
184 |         if not (item is None or item == ''):
185 |             status_set.add(id_operator_dict[item].status)
186 |             operator_from_one_ids.extend(id_operator_dict[item].child_operator_ids.split(','))
187 | 
188 |     if len(status_set) == 1 and "success" in status_set:
189 |         return "success"
190 |     else:
191 |         return "error"
192 | 
193 | 
194 | def get_run_status_by_project_id(project_id, model_execute_id):
195 |     """
196 |     获取某次执行的状态和其中的每个算子的状态
197 | 
198 |     :param project_id:
199 |     :param model_execute_id: model的执行记录ID
200 |     :return:
201 |     """
202 | 
203 |     # 获取 model
204 |     model = ModelDao.get_model_by_project_id(project_id)
205 |     if model is False:
206 |         return False
207 | 
208 |     # 获取 operator
209 |     operators = OperatorDao.get_operator_by_model_id(model.id)
210 |     if operators is False:
211 |         return False
212 | 
213 |     # 构造dict
214 |     id_operator_dict = {}
215 |     for operator in operators:
216 |         id_operator_dict[operator.id] = operator
217 | 
218 |     # 查看此次执行记录（状态、起始节点）
219 |     model_execute_ = ModelExecuteDao.get_model_execute_by_id(model_execute_id)
220 |     operator_from_one_ids = model_execute_.start_nodes.split(',')
221 | 
222 |     # 查看此次执行的所有节点的状态
223 |     result = dict()
224 |     while operator_from_one_ids:
225 |         item = operator_from_one_ids.pop(0)
226 |         if not (item is None or item == ''):
227 |             result[id_operator_dict[item].id] = {"status": id_operator_dict[item].status,
228 |                                                  "log": id_operator_dict[item].run_info}
229 |             operator_from_one_ids.extend(id_operator_dict[item].child_operator_ids.split(','))
230 | 
231 |     return {"modelExecuteStatus": model_execute_.status, "operatorStatus": result}
232 | 
233 | 
234 | def run_execute_status_from_start(user_id, project_id):
235 |     """
236 |     设置模型运行时状态(从头开始执行)
237 |     :param user_id:
238 |     :param project_id:
239 |     :return:
240 |     """
241 |     # 获取 model
242 |     model = ModelDao.get_model_by_project_id(project_id)
243 |     if model is False:
244 |         return False
245 |     # 状态初始化
246 |     start_nodes = model.start_nodes.split(',')
247 |     model_execute_id = initial_execute_status(user_id, start_nodes)
248 |     ModelExecuteDao.update_model_execute(model_execute_id, "running", "", "")
249 |     return {'model_execute_id': model_execute_id, 'start_nodes': start_nodes}
250 | 
251 | 
252 | def run_execute_status_from_one(user_id, operator_id):
253 |     """
254 |     设置模型运行时状态（从某个节点开始执行）
255 |     :param user_id:
256 |     :param operator_id:
257 |     :return:
258 |     """
259 |     # 状态初始化
260 |     start_nodes = [operator_id]
261 |     model_execute_id = initial_execute_status(user_id, start_nodes)
262 |     ModelExecuteDao.update_model_execute(model_execute_id, "running", "", "")
263 |     return {'model_execute_id': model_execute_id, 'start_nodes': start_nodes}
264 | 
265 | 
266 | def model_execute(user_id, project_id, param):
267 |     """
268 |     执行模型
269 |     :param user_id: 1
270 |     :param project_id: 32
271 |     :param param: {'model_execute_id': model_execute_id, 'start_nodes': start_nodes}
272 |     :return:
273 |     """
274 |     model_execute_id = param['model_execute_id']
275 |     start_nodes = param['start_nodes']
276 |     # spark会话
277 |     spark_session = getSparkSession(user_id, "executeModel")
278 |     # 多线程执行
279 |     print("-----model_execute_from_start------", "start_nodes", ','.join(start_nodes))
280 |     ModelExecuteService.model_thread_execute(spark_session, start_nodes)
281 |     # 执行完毕，更改执行状态
282 |     end_status = get_status_model_execute_end(project_id, start_nodes)
283 |     ModelExecuteDao.update_model_execute(model_execute_id, end_status, "",
284 |                                          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
285 |     return model_execute_id
286 | 
287 | 
288 | def initial_execute_status(execute_user_id, start_nodes):
289 |     """
290 |     每次执行model时，初始化执行状态
291 |     :param execute_user_id:
292 |     :param start_nodes: []
293 |     :return:
294 |     """
295 |     # 查找参与运行的 operator
296 |     operator_list = []
297 |     operator_id_queue = []
298 |     for x in start_nodes:
299 |         operator_id_queue.append(x)
300 |     while len(operator_id_queue) > 0:
301 |         operator_id = operator_id_queue.pop(0)
302 |         if operator_id is None or operator_id == "":
303 |             continue
304 |         operator = OperatorDao.get_operator_by_id(operator_id)
305 |         operator_list.append(operator)
306 |         for x in operator.child_operator_ids.split(','):
307 |             operator_id_queue.append(x)
308 | 
309 |     # 每个operator 状态初始化为initial
310 |     for operator in operator_list:
311 |         OperatorDao.update_operator_by_id(operator.id, "initial")
312 | 
313 |     # 追加执行记录
314 |     model_execute = ModelExecute(start_nodes=','.join(start_nodes), status='initial', execute_user_id=execute_user_id,
315 |                                  create_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
316 |     model_execute = ModelExecuteDao.create_model_execute(model_execute)
317 |     if model_execute is False:
318 |         return False
319 |     else:
320 |         return model_execute.id
321 | 


--------------------------------------------------------------------------------
/app/service/PreprocessService.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from app.Utils import *
  3 | import random
  4 | import string
  5 | from app.ConstFile import const
  6 | import app.dao.OperatorDao as OperatorDao
  7 | from pyspark.sql import functions as func
  8 | from pyspark.sql.functions import split, explode, concat_ws, regexp_replace
  9 | 
 10 | save_dir = const.SAVEDIR
 11 | 
 12 | 
 13 | def read_data_with_update_record(spark_session, operator_id, file_url):
 14 |     """
 15 |     读数据算子，拷贝数据并更新算子记录表
 16 | 
 17 |     :param spark_session:
 18 |     :param operator_id:
 19 |     :param file_url:
 20 |     :return:
 21 |     """
 22 |     try:
 23 |         # 修改计算状态
 24 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
 25 |         # 读取数据
 26 |         df = read_data(spark_session, file_url)
 27 |         # 存储结果
 28 |         result_file_url = save_data(df)
 29 | 
 30 |         run_info = 'read_data算子执行成功'
 31 |         # 修改计算状态
 32 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
 33 |         return [result_file_url]
 34 |     except Exception as e:
 35 |         run_info = str(e)
 36 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
 37 |         traceback.print_exc()
 38 |         return []
 39 | 
 40 | 
 41 | def filter_multi_conditions(spark_session, operator_id, file_url, condition):
 42 |     """
 43 |     按照多个条件进行过滤
 44 | 
 45 |     :param spark_session:
 46 |     :param operator_id:
 47 |     :param file_url:
 48 |     :param condition: {"userId":1,"projectId":32,"parameter":[{"colName":"利润", "operate":">","value":"100", "relation":"AND"},{"colName":"装运方式", "operate":"==", "value":"一级", "relation":""}]}
 49 |     :return:
 50 |     """
 51 | 
 52 |     try:
 53 | 
 54 |         # 修改计算状态
 55 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
 56 |         # 读取数据
 57 |         df = read_data(spark_session, file_url)
 58 |         # 过滤函数
 59 |         result_df = filter_core(spark_session, df, condition['parameter'])
 60 |         # 存储结果
 61 |         result_file_url = save_data(result_df)
 62 | 
 63 |         run_info = '过滤算子执行成功'
 64 |         # 修改计算状态
 65 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
 66 |         return [result_file_url]
 67 |     except Exception as e:
 68 |         run_info = str(e)
 69 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
 70 |         traceback.print_exc()
 71 |         return []
 72 | 
 73 | 
 74 | def filter_core(spark, df, condition):
 75 |     """
 76 |     过滤的核心函数
 77 |     :param spark:
 78 |     :param df:
 79 |     :param condition:
 80 |     :return:
 81 |     """
 82 | 
 83 |     table_name = ''.join(random.sample(string.ascii_letters + string.digits, 8))
 84 |     sql_str = 'select * from ' + table_name + ' where'
 85 |     for i in condition:
 86 |         if is_number(i['value']):
 87 |             sql_str = sql_str + ' `' + i['colName'] + '` ' + i['operate'] + ' ' + i['value'] + ' ' + i['relation']
 88 |         else:
 89 |             sql_str = sql_str + ' `' + i['colName'] + '` ' + i['operate'] + ' \"' + i['value'] + '\" ' + i['relation']
 90 |     print(sql_str)
 91 |     df.createOrReplaceTempView(table_name)
 92 |     sql_df = spark.sql(sql_str)
 93 | 
 94 |     return sql_df
 95 | 
 96 | 
 97 | def sort(spark_session, operator_id, file_url, condition):
 98 |     """
 99 |     排序
100 | 
101 |     :param spark_session:
102 |     :param operator_id:
103 |     :param file_url:
104 |     :param condition: {"userId":1,"projectId":32,"columnName":"利润","sortType":"降序"}
105 |     :return:
106 |     """
107 | 
108 |     try:
109 |         # 修改计算状态
110 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
111 |         # 读取数据
112 |         df = read_data(spark_session, file_url)
113 |         # 过滤函数
114 |         result_df = sort_core(df, condition['columnName'], condition['sortType'])
115 |         # 存储结果
116 |         result_file_url = save_data(result_df)
117 |         # TODO ：判断返回结果是否是String（异常信息）
118 |         run_info = '排序算子执行成功'
119 |         # 修改计算状态
120 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
121 |         return [result_file_url]
122 | 
123 |     except Exception as e:
124 |         run_info = str(e)
125 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
126 |         traceback.print_exc()
127 |         return []
128 | 
129 | 
130 | def sort_core(df, column_name, column_type):
131 |     """
132 |     排序主函数，函数功能包括解析参数、排序；返回df(spark格式)
133 |     :param df:
134 |     :param column_name:
135 |     :param column_type:
136 |     :return:
137 |     """
138 |     # 只能输入一列，否则报错
139 |     if len(column_name.split(",")) != 1:
140 |         return "ERROR_NOT_ONLY_ONE_COL"
141 | 
142 |     # sortType默认为升序，若用户指定，以用户指定为准
143 |     if (column_type == "") or (column_type is None):
144 |         column_type = "升序"
145 | 
146 |     # 排序
147 |     if column_type == "降序":
148 |         df = df.sort(column_name, ascending=False)
149 |     else:
150 |         df = df.sort(column_name)
151 |     return df
152 | 
153 | 
154 | def column_split(spark_session, operator_id, file_url, condition):
155 |     """
156 |     按列拆分
157 |     :param spark_session:
158 |     :param operator_id:
159 |     :param file_url:
160 |     :param condition:  {"userId": 1, "projectId": 32, "columnName": "订购日期", "delimiter": "/", "newColumnNames": ["year", "月"]}
161 |     :return:
162 |     """
163 |     try:
164 |         # 修改计算状态
165 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
166 |         # 读取数据
167 |         df = read_data(spark_session, file_url)
168 |         # 拆分函数
169 |         result_df = column_split_core(spark_session, df, condition)
170 |         # 存储结果
171 |         result_file_url = save_data(result_df)
172 |         # 修改计算状态
173 |         run_info = '拆分算子执行成功'
174 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
175 |         return [result_file_url]
176 | 
177 |     except Exception as e:
178 |         run_info = str(e)
179 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
180 |         traceback.print_exc()
181 |         return []
182 | 
183 | 
184 | def column_split_core(ss, df, condition_dict):
185 |     """
186 |     按列拆分主函数，返回dataFrame(spark格式)
187 |     :param ss:
188 |     :param df:
189 |     :param condition_dict:
190 |     :return:
191 |     """
192 |     # 参数解析
193 |     column_name = condition_dict['columnName']
194 |     delimiter = condition_dict['delimiter']
195 |     new_column_names = condition_dict['newColumnNames']
196 |     # 获取拆分出的新列的列名，若未指定，暂时存储为空列表，后续根据拆分数填充成为[x_split_1, x_split_2, ...]
197 |     if new_column_names is None:
198 |         new_column_names = []
199 |     # 将指定列columnName按splitSymbol拆分，存入"splitColumn"列，列内数据格式为[a, b, c, ...]
200 |     first_row = df.first()
201 |     df_split = df.withColumn("splitColumn", split(df[column_name], delimiter))
202 |     split_number = len(first_row[column_name].split(delimiter))
203 |     # 若用户为指定拆分出的新列的列名，根据拆分数填充
204 |     if len(new_column_names) == 0:
205 |         for i in range(split_number):
206 |             new_column_names.append(column_name + '_split_' + str(i + 1))
207 |     # 给新列名生成索引，格式为：[('年', 0), ('月', 1), ('日', 2)]，方便后续操作
208 |     sc = ss.sparkContext
209 |     newColumnNames_with_index = sc.parallelize(new_column_names).zipWithIndex().collect()
210 |     # 遍历生成新列
211 |     for name, index in newColumnNames_with_index:
212 |         df_split = df_split.withColumn(name, df_split["splitColumn"].getItem(index))
213 |     df = df_split.drop("splitColumn")
214 |     return df
215 | 
216 | 
217 | def columns_merge(spark_session, operator_id, file_url, condition):
218 |     """
219 |     多列合并
220 |     :param spark_session:
221 |     :param operator_id:
222 |     :param file_url:
223 |     :param condition: {"userId": 1, "projectId": 32, "columnNames": ["类别", "子类别", "产品名称"], "connector": "-", "newColumnName": "品类名称"}
224 |     :return:
225 |     """
226 |     try:
227 |         # 修改计算状态
228 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
229 |         # 读取数据
230 |         df = read_data(spark_session, file_url)
231 |         # 合并函数
232 |         result_df = columns_merge_core(df, condition)
233 |         # 存储结果
234 |         result_file_url = save_data(result_df)
235 |         # 修改计算状态
236 |         run_info = '多列合并算子执行成功'
237 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
238 |         return [result_file_url]
239 | 
240 |     except Exception as e:
241 |         run_info = str(e)
242 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
243 |         traceback.print_exc()
244 |         return []
245 | 
246 | 
247 | def columns_merge_core(df, condition_dict):
248 |     """
249 |     多列合并主函数，新增一列，列内的值为指定多列合并而成；返回df(spark格式)
250 |     :param df:
251 |     :param condition_dict:
252 |     :return:
253 |     """
254 |     # 解析参数
255 |     column_names = condition_dict['columnNames']
256 |     split_symbol = condition_dict['connector']
257 |     new_column_name = condition_dict['newColumnName']
258 |     # 默认分隔符是","，或以用户指定为准
259 |     if split_symbol is None or split_symbol == '':
260 |         split_symbol = ','
261 |     if new_column_name is None or new_column_name == '':
262 |         new_column_name = '_'.join(new_column_name)
263 |     # 合并
264 |     column_list = []
265 |     for i in range(len(column_names)):
266 |         column_list.append(df[column_names[i]])
267 |     df = df.withColumn(new_column_name, concat_ws(split_symbol, *column_list))
268 |     return df
269 | 
270 | 
271 | def replace(spark_session, operator_id, file_url, condition):
272 |     """
273 |     数据替换
274 |     :param spark_session:
275 |     :param operator_id:
276 |     :param file_url:
277 |     :param condition: {"userId": 1, "projectId": 32, "columnNames": ["类别", "子类别", "客户名称"],"replaceCharacters":[{"source":"技术","target":"技术copy"},{"source":"电话","target":"电话copy"}]}
278 |     :return:
279 |     """
280 |     try:
281 |         # 修改计算状态
282 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
283 |         # 读取数据
284 |         df = read_data(spark_session, file_url)
285 |         # 替换函数
286 |         result_df = replace_core(df, condition)
287 |         # 存储结果
288 |         result_file_url = save_data(result_df)
289 |         # 修改计算状态
290 |         run_info = '数据替换算子执行成功'
291 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
292 |         return [result_file_url]
293 | 
294 |     except Exception as e:
295 |         run_info = str(e)
296 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
297 |         traceback.print_exc()
298 |         return []
299 | 
300 | 
301 | def replace_core(df, condition_dict):
302 |     """
303 |     数据列替换主函数, 将多个列中的字符进行替换；返回df(spark格式)
304 |     :param df:
305 |     :param condition_dict:
306 |     :return:
307 |     """
308 | 
309 |     def mul_regexp_replace(col):
310 |         """
311 |         对每一列进行替换
312 |         :param col:
313 |         :return:
314 |         """
315 |         for item in replace_characters:
316 |             col = regexp_replace(col, item["source"], item["target"])
317 |         return col
318 | 
319 |     # 解析参数
320 |     column_names = condition_dict['columnNames']
321 |     replace_characters = condition_dict['replaceCharacters']
322 |     # 对每一列进行替换
323 |     for i in range(len(column_names)):
324 |         column_name = column_names[i]
325 |         df = df.withColumn(column_name, (mul_regexp_replace(df[column_name])))
326 |     return df
327 | 
328 | 
329 | def fill_null_value(spark_session, operator_id, file_url, condition):
330 |     """
331 |     填充空值
332 |     :param spark_session:
333 |     :param operator_id:
334 |     :param file_url:
335 |     :param condition: {'userId':1,'projectId':32,'parameter':[{'operate':'均值填充','colName':''},{'operate':'均值填充','colName':'最大值填充'}]}
336 |     :return:
337 |     """
338 | 
339 |     try:
340 |         # 修改计算状态
341 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
342 |         # 读取数据
343 |         df = read_data(spark_session, file_url)
344 |         # 空值填充函数
345 |         result_df = fill_null_value_core(df, condition["parameter"])
346 |         # 存储结果
347 |         result_file_url = save_data(result_df)
348 |         # 修改计算状态
349 |         run_info = '数据替换算子执行成功'
350 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
351 |         return [result_file_url]
352 | 
353 |     except Exception as e:
354 |         run_info = str(e)
355 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
356 |         traceback.print_exc()
357 |         return []
358 | 
359 | 
360 | def fill_null_value_core(df, condition):
361 |     """
362 |     填充空值核心函数
363 |     :param df:
364 |     :param condition:
365 |     :return:
366 |     """
367 |     for i in condition:
368 |         if i['operate'] == '均值填充':
369 |             mean_item = df.select(func.mean(i['colName'])).collect()[0][0]
370 |             df = df.na.fill({i['colName']: mean_item})
371 |         elif i['operate'] == '最大值填充':
372 |             mean_item = df.select(func.max(i['colName'])).collect()[0][0]
373 |             df = df.na.fill({i['colName']: mean_item})
374 |         elif i['operate'] == '最小值填充':
375 |             mean_item = df.select(func.min(i['colName'])).collect()[0][0]
376 |             df = df.na.fill({i['colName']: mean_item})
377 |     return df
378 | 
379 | 
380 | def column_map(spark_session, operator_id, file_url, condition):
381 |     """
382 |     列映射
383 |     :param spark_session:
384 |     :param operator_id:
385 |     :param file_url:
386 |     :param condition:{"userId":1,"projectId":32,"parameter":[{"colName_1":"利润", "operate_1":"+","value_1":"100","operate":"+","colName_2":"数量", "operate_2":"*","value_2":"0.0001","newName":"newCol1"},{"colName_1":"利润", "operate_1":"+","value_1":"10","operate":"*","colName_2":"数量", "operate_2":"*","value_2":"0.1","newName":"newCol2"}]}
387 |     :return:
388 |     """
389 | 
390 |     try:
391 |         # 修改计算状态
392 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
393 |         # 读取数据
394 |         df = read_data(spark_session, file_url)
395 |         # 列映射函数
396 |         result_df = column_map_core(df, condition["parameter"])
397 |         # 存储结果
398 |         result_file_url = save_data(result_df)
399 |         # 修改计算状态
400 |         run_info = '列映射算子执行成功'
401 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
402 |         return [result_file_url]
403 | 
404 |     except Exception as e:
405 |         run_info = str(e)
406 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
407 |         traceback.print_exc()
408 |         return []
409 | 
410 | 
411 | def column_map_core(df, condition):
412 |     """
413 |     列映射核心函数
414 |     :param df:
415 |     :param condition:
416 |     :return:
417 |     """
418 |     for i in condition:
419 |         name1 = i['colName_1']
420 |         name2 = i['colName_2']
421 |         new_name = i['newName']
422 |         if i['operate_1'] == '+':
423 |             df = df.withColumn(new_name, df[name1] + i['value_1'])
424 |         elif i['operate_1'] == '-':
425 |             df = df.withColumn(new_name, df[name1] - i['value_1'])
426 |         elif i['operate_1'] == '*':
427 |             df = df.withColumn(new_name, df[name1] * i['value_1'])
428 |         elif i['operate_1'] == '/':
429 |             df = df.withColumn(new_name, df[name1] / i['value1_'])
430 |         if not ((name2 == "") or (name2 is None)):
431 |             new_name2 = new_name + "_2"
432 |             if i['operate_2'] == '+':
433 |                 df = df.withColumn(new_name2, df[name2] + i['value_2'])
434 |             elif i['operate_2'] == '-':
435 |                 df = df.withColumn(new_name2, df[name2] - i['value_2'])
436 |             elif i['operate_2'] == '*':
437 |                 df = df.withColumn(new_name2, df[name2] * i['value_2'])
438 |             elif i['operate_2'] == '/':
439 |                 df = df.withColumn(new_name2, df[name2] / i['value_2'])
440 | 
441 |             if i['operate'] == '+':
442 |                 df = df.withColumn(new_name, df[new_name] + df[new_name2])
443 |             elif i['operate'] == '-':
444 |                 df = df.withColumn(new_name, df[new_name] - df[new_name2])
445 |             elif i['operate'] == '*':
446 |                 df = df.withColumn(new_name, df[new_name] * df[new_name2])
447 |             elif i['operate'] == '/':
448 |                 df = df.withColumn(new_name, df[new_name] / df[new_name2])
449 |             df = df.drop(new_name2)
450 |     return df
451 | 
452 | 
453 | def random_split(spark_session, operator_id, file_url, condition):
454 |     """
455 |     按照比例随机划分数据
456 |     :param spark_session:
457 |     :param operator_id:
458 |     :param file_url:
459 |     :param condition:
460 |     :return:
461 |     """
462 |     try:
463 |         # 修改计算状态
464 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
465 |         # 读取数据
466 |         df = read_data(spark_session, file_url)
467 |         # 划分函数
468 |         (result_df1, result_df2) = random_split_core(df, condition)
469 |         # 存储结果
470 |         result_file_url1 = save_data(result_df1)
471 |         result_file_url2 = save_data(result_df2)
472 |         # 修改计算状态
473 |         run_info = '列映射算子执行成功'
474 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url1 + "*," +
475 |                                           result_file_url2, run_info)
476 |         return [result_file_url1, result_file_url2]
477 | 
478 |     except Exception as e:
479 |         run_info = str(e)
480 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
481 |         traceback.print_exc()
482 |         return []
483 | 
484 | 
485 | def random_split_core(df, condition):
486 |     """
487 |     划分函数
488 |     :param df:
489 |     :param condition:{"proportion1": 0.7, "proportion2": 0.3, "seed": 10}
490 |     :return:
491 |     """
492 | 
493 |     seed = condition['seed']
494 |     train = float(condition['proportion1'])
495 |     test = float(condition['proportion2'])
496 |     (trainingData, testData) = df.randomSplit([train, test], seed=seed)
497 |     return trainingData, testData
498 | 


--------------------------------------------------------------------------------
/app/service/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | 逻辑处理
3 | """
4 | 


--------------------------------------------------------------------------------
/app/service/ml/Evaluation.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | 
  3 | """
  4 | 模型评估
  5 | """
  6 | import app.dao.OperatorDao as OperatorDao
  7 | from pyspark.mllib.classification import SVMModel
  8 | from pyspark.mllib.regression import LabeledPoint
  9 | from pyspark.mllib.evaluation import BinaryClassificationMetrics
 10 | from pyspark.ml.linalg import Vectors
 11 | from pyspark.sql.types import Row
 12 | from pyspark.ml.classification import LogisticRegressionModel
 13 | from app.Utils import *
 14 | 
 15 | 
 16 | def second_evaluation(spark_session, operator_id, condition):
 17 |     """
 18 |     二分类评估
 19 |     :param spark_session:
 20 |     :param operator_id:
 21 |     :param condition:
 22 |     :return:
 23 |     """
 24 |     try:
 25 |         # 修改计算状态
 26 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
 27 |         # 评估函数
 28 |         result_df = second_evaluation_core(spark_session, condition, operator_id)
 29 |         if isinstance(result_df, str):
 30 |             OperatorDao.update_operator_by_id(operator_id, 'error', '', result_df)
 31 |         else:
 32 |             # 存储结果
 33 |             result_df.show()
 34 |             result_file_url = save_data(result_df)
 35 |             run_info = '评估算子执行成功'
 36 |             # 修改计算状态
 37 |             OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
 38 |             return [result_file_url]
 39 | 
 40 |     except Exception as e:
 41 |         run_info = str(e)
 42 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
 43 |         traceback.print_exc()
 44 |     return []
 45 | 
 46 | 
 47 | def second_evaluation_core(spark_session, condition, operator_id):
 48 |     """
 49 |     二分类评估核心函数
 50 |     :param spark_session:
 51 |     :param condition:
 52 |     :param operator_id:
 53 |     :return:
 54 |     """
 55 |     # 读模型
 56 |     # 当前节点（评估节点）一个父节点
 57 |     operator = OperatorDao.get_operator_by_id(operator_id)
 58 |     # 父节点(预测节点) 两个父节点
 59 |     father_id = operator.father_operator_ids
 60 |     father_operator = OperatorDao.get_operator_by_id(father_id)
 61 |     # 祖节点（模型节点和读预测数据节点）
 62 |     grand_father_ids = father_operator.father_operator_ids.split(',')
 63 |     print("**********祖节点（模型节点和读预测数据源节点）:", grand_father_ids)
 64 | 
 65 |     # 读数据
 66 |     def get_predict_data(operator_config_):
 67 |         for grand_father_file_ in operator_config_:
 68 |             grand_father_id_ = list(grand_father_file_.keys())[0]
 69 |             grand_father_ = OperatorDao.get_operator_by_id(grand_father_id_)
 70 |             if grand_father_.operator_type_id == 5001 or grand_father_.operator_type_id < 3000:
 71 |                 print("***************评估函数，预测数据：", grand_father_.operator_type_id)
 72 |                 pre_data_file_url = grand_father_.operator_output_url.split('*,')[
 73 |                     grand_father_file_[grand_father_id_]]
 74 |                 print("***************评估函数，预测数据url：", pre_data_file_url)
 75 |                 return read_data(spark_session, pre_data_file_url)
 76 | 
 77 |     print("**********预测节点:", father_operator.operator_config)
 78 |     df = get_predict_data(json.loads(father_operator.operator_config)['fileUrl'])
 79 | 
 80 |     # 评估
 81 |     for grand_father_id in grand_father_ids:
 82 |         grand_father = OperatorDao.get_operator_by_id(grand_father_id)
 83 |         grand_father_operator_type = grand_father.operator_type_id
 84 |         # 模型加载节点
 85 |         if grand_father_operator_type == 8000:
 86 |             grand_father_operator_type = json.loads(grand_father.operator_config)['parameter']['modelTypeId']
 87 |         if grand_father_operator_type == 6001:  # svm二分类节点
 88 |             print("***************评估函数，训练模型", grand_father.operator_type_id)
 89 |             evaluation_df = svm_second_evaluation(spark_session, grand_father.operator_output_url, df,
 90 |                                                   json.loads(father_operator.operator_config)['parameter'], condition)
 91 |             return evaluation_df
 92 |         elif grand_father_operator_type == 6003:  # lr二分类节点
 93 |             print("***************评估函数，训练模型", grand_father.operator_type_id)
 94 |             evaluation_df = lr_second_evaluation(spark_session, grand_father.operator_output_url, df,
 95 |                                                  json.loads(father_operator.operator_config)['parameter'], condition)
 96 |             return evaluation_df
 97 | 
 98 | 
 99 | def svm_second_evaluation(spark_session, svm_model_path, df, predict_condition, condition):
100 |     """
101 |     svm二分类评估
102 |     :param spark_session:
103 |     :param svm_model_path: 模型地址
104 |     :param df: 预测数据
105 |     :param predict_condition: 预测算子（父算子）配置
106 |     :param condition: 该算子配置 {"label":"标签"}
107 |     :return:
108 |     """
109 | 
110 |     feature_indexs = predict_condition['features']
111 |     label = condition['label']
112 | 
113 |     # 1. 准备数据
114 |     def func(x):
115 |         features_data = []
116 |         for feature in feature_indexs:
117 |             features_data.append(x[feature])
118 |         return LabeledPoint(label=x[label], features=features_data)
119 | 
120 |     predict_data = df.rdd.map(lambda x: func(x))
121 | 
122 |     # 加载模型
123 |     svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)
124 | 
125 |     # 计算评估指标
126 |     svmTotalCorrect = predict_data.map(lambda r: 1 if (svm_model.predict(r.features) == r.label) else 0).reduce(
127 |         lambda x, y: x + y)
128 |     svmAccuracy = svmTotalCorrect / float(predict_data.count())
129 | 
130 |     # 清除默认阈值，这样会输出原始的预测评分，即带有确信度的结果
131 |     svm_model.clearThreshold()
132 |     svmPredictionAndLabels = predict_data.map(lambda lp: (float(svm_model.predict(lp.features)), lp.label))
133 |     svmMetrics = BinaryClassificationMetrics(svmPredictionAndLabels)
134 |     print("Area under PR = %s" % svmMetrics.areaUnderPR)
135 |     print("Area under ROC = %s" % svmMetrics.areaUnderROC)
136 | 
137 |     # 返回数据
138 |     result = [("正确个数", float(svmTotalCorrect)),
139 |               ("精准度", float(svmAccuracy)),
140 |               ("Area under PR", float(svmMetrics.areaUnderPR)),
141 |               ("Area under ROC", float(svmMetrics.areaUnderROC))]
142 |     return spark_session.createDataFrame(result, schema=['指标', '值'])
143 | 
144 | 
145 | def lr_second_evaluation(spark_session, lr_model_path, df, predict_condition, condition):
146 |     """
147 |     lr二分类评估
148 |     :param spark_session:
149 |     :param lr_model_path: 模型地址
150 |     :param df: 预测数据
151 |     :param predict_condition: 预测算子（父算子）配置
152 |     :param condition: 该算子配置 {"label":"标签"}
153 |     :return:
154 |     """
155 | 
156 |     feature_indexs = predict_condition['features']
157 |     label_index = condition['label']
158 | 
159 |     # 1. 准备数据
160 |     def func(x):
161 |         features_data = []
162 |         for feature in feature_indexs:
163 |             features_data.append(x[feature])
164 |         return Row(label=x[label_index], features=Vectors.dense(features_data))
165 | 
166 |     predict_data = df.rdd.map(lambda x: func(x)).toDF()
167 | 
168 |     # 2.加载模型
169 |     print("*****lr_model_path:", lr_model_path)
170 |     lr_model = LogisticRegressionModel.load(lr_model_path)
171 | 
172 |     # 计算评估指标
173 |     result = lr_model.transform(predict_data)
174 |     print(result.prediction)
175 |     lrTotalCorrect = result.rdd.map(lambda r: 1 if (r.prediction == r.label) else 0).reduce(lambda x, y: x + y)
176 | 
177 |     lrAccuracy = lrTotalCorrect / float(predict_data.count())  # 0.5136044023234485
178 |     # # 清除默认阈值，这样会输出原始的预测评分，即带有确信度的结果
179 |     lrPredictionAndLabels = result.rdd.map(lambda lp: (float(lp.prediction), float(lp.label)))
180 |     lrmetrics = BinaryClassificationMetrics(lrPredictionAndLabels)
181 | 
182 |     print("Area under PR = %s" % lrmetrics.areaUnderPR)
183 |     print("Area under ROC = %s" % lrmetrics.areaUnderROC)
184 | 
185 |     # 返回数据
186 |     result = [("正确个数", float(lrTotalCorrect)),
187 |               ("精准度", float(lrAccuracy)),
188 |               ("Area under PR", float(lrmetrics.areaUnderPR)),
189 |               ("Area under ROC", float(lrmetrics.areaUnderROC))]
190 |     return spark_session.createDataFrame(result, schema=['指标', '值'])
191 | 


--------------------------------------------------------------------------------
/app/service/ml/ModelService.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 模型加载
 3 | """
 4 | import app.dao.OperatorDao as OperatorDao
 5 | import app.dao.MLModelDao as MLModelDao
 6 | from app.Utils import *
 7 | 
 8 | 
 9 | def model_operator(operator_id, condition):
10 |     """
11 |     加载模型算子
12 |     :param operator_id:
13 |     :param condition:{"MLModelId": 2, "modelTypeId": 6001}
14 |     :return:
15 |     """
16 | 
17 |     try:
18 |         # 修改计算状态
19 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
20 |         # 评估函数
21 |         model_file_url = model_operator_core(condition)
22 |         # 修改计算状态
23 |         run_info = '模型算子执行成功'
24 |         OperatorDao.update_operator_by_id(operator_id, 'success', model_file_url, run_info)
25 |         return [model_file_url]
26 | 
27 |     except Exception as e:
28 |         run_info = str(e)
29 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
30 |         traceback.print_exc()
31 |     return []
32 | 
33 | 
34 | def model_operator_core(condition):
35 |     # 查询 ml_model
36 |     ml_model_id = condition['MLModelId']
37 |     ml_model = MLModelDao.get_ml_model(ml_model_id)
38 |     return ml_model.model_url
39 | 


--------------------------------------------------------------------------------
/app/service/ml/MultipleClassifition.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | """
  3 | 多分类
  4 | """
  5 | 
  6 | import app.dao.OperatorDao as OperatorDao
  7 | from app.Utils import *
  8 | from pyspark.ml.linalg import Vectors
  9 | from pyspark.ml.classification import LogisticRegression, MultilayerPerceptronClassifier
 10 | from pyspark.sql.types import Row
 11 | 
 12 | 
 13 | def model_url():
 14 |     """
 15 |     二分类模型保存地址
 16 |     :return:
 17 |     """
 18 |     return const.MIDDATA + 'model/multipleClassification'
 19 | 
 20 | 
 21 | def lr(spark_session, operator_id, file_url, condition):
 22 |     """
 23 |     逻辑回归多分类
 24 |     :param spark_session:
 25 |     :param operator_id:
 26 |     :param file_url:
 27 |     :param condition:
 28 |     :return:
 29 |     """
 30 |     try:
 31 |         # 修改计算状态
 32 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
 33 |         # 读取数据
 34 |         df = read_data(spark_session, file_url)
 35 |         # svm_core函数
 36 |         result_model_url = lr_core(df, condition)
 37 |         # 修改计算状态
 38 |         run_info = '逻辑回归多分类算子执行成功'
 39 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_model_url, run_info)
 40 |         return [result_model_url]
 41 | 
 42 |     except Exception as e:
 43 |         run_info = str(e)
 44 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
 45 |         traceback.print_exc()
 46 |     return []
 47 | 
 48 | 
 49 | def lr_core(df, condition):
 50 |     """
 51 |     lr多分类核心函数
 52 |     :param spark_session:
 53 |     :param df:
 54 |     :param condition:{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20,"regParam":0.0,"elasticNetParam":0.0,"tol":0.000006,"fitIntercept":True}
 55 |     :return:
 56 |     """
 57 |     # 参数
 58 |     label_index = condition['label']  # 标签列(列名或列号)
 59 |     feature_indexs = condition['features']  # 特征列(列名或列号)
 60 |     iterations = condition['iterations']  # 最大迭代次数(默认100)
 61 |     regParam = condition['regParam']  # 正则化参数（默认0.0）
 62 |     # ElasticNet混合参数，范围为[0，1]。对于alpha = 0，惩罚是L2惩罚。对于alpha = 1，这是L1惩罚（默认值：0.0)
 63 |     elasticNetParam = condition['elasticNetParam']
 64 |     tol = condition['tol']  # 迭代算法的收敛容限（> = 0）（默认值：1e-06即 0.000006）
 65 |     fitIntercept = condition['fitIntercept']  # 是否训练截距项（默认值："True","False"可选)
 66 | 
 67 |     # 参数类型转换
 68 |     if isinstance(iterations, str):
 69 |         iterations = int(iterations)
 70 |     if isinstance(regParam, str):
 71 |         regParam = float(regParam)
 72 |     if isinstance(elasticNetParam, str):
 73 |         elasticNetParam = float(elasticNetParam)
 74 |     if isinstance(tol, str):
 75 |         tol = float(tol)
 76 |     if isinstance(fitIntercept, str):
 77 |         if fitIntercept == 'False':
 78 |             fitIntercept = False
 79 |         else:
 80 |             fitIntercept = True
 81 | 
 82 |     # 1. 准备数据
 83 |     def func(x):
 84 |         features_data = []
 85 |         for feature in feature_indexs:
 86 |             features_data.append(x[feature])
 87 |         return Row(label=x[label_index], features=Vectors.dense(features_data))
 88 | 
 89 |     training_set = df.rdd.map(lambda x: func(x)).toDF()
 90 | 
 91 |     # 2.训练模型
 92 |     lr_param = LogisticRegression(featuresCol="features", labelCol="label", predictionCol="prediction",
 93 |                                   maxIter=iterations, regParam=regParam, elasticNetParam=elasticNetParam, tol=tol,
 94 |                                   fitIntercept=fitIntercept, probabilityCol="probability",
 95 |                                   rawPredictionCol="rawPrediction", standardization=True, aggregationDepth=2,
 96 |                                   family="multinomial")
 97 |     lr_model = lr_param.fit(training_set)
 98 |     print(lr_model.coefficientMatrix)  # 系数
 99 |     print(lr_model.interceptVector)  # 截距
100 |     print(lr_model.explainParams())  # 参数以及其注解
101 | 
102 |     # 3.保存模型
103 |     lr_model_path = model_url() + '/lr/' + str(uuid.uuid1())
104 |     deltree(lr_model_path)  # 删除已经存在的模型
105 |     lr_model.write().overwrite().save(lr_model_path)
106 | 
107 |     return lr_model_path
108 | 
109 | 
110 | def mpc(spark_session, operator_id, file_url, condition):
111 |     """
112 |     mpc多分类
113 |     Classifier trainer based on the Multilayer Perceptron.
114 |     Each layer has sigmoid activation function, output layer has softmax.
115 |     Number of inputs has to be equal to the size of feature vectors.
116 |     Number of outputs has to be equal to the total number of labels.
117 | 
118 |     :param spark_session:
119 |     :param operator_id:
120 |     :param file_url:
121 |     :param condition:
122 |     :return:
123 |     """
124 |     try:
125 |         # 修改计算状态
126 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
127 |         # 读取数据
128 |         df = read_data(spark_session, file_url)
129 |         # svm_core函数
130 |         result_model_url = mpc_core(df, condition)
131 |         # 修改计算状态
132 |         run_info = 'mpc多分类算子执行成功'
133 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_model_url, run_info)
134 |         return [result_model_url]
135 | 
136 |     except Exception as e:
137 |         run_info = str(e)
138 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
139 |         traceback.print_exc()
140 |     return []
141 | 
142 | 
143 | def mpc_core(df, condition):
144 |     """
145 |     mpc多分类核心函数
146 |     :param df:
147 |     :param condition:{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20,"regParam":0.0,"elasticNetParam":0.0,"tol":0.000006,"fitIntercept":True}
148 |     :return:
149 |     """
150 |     {"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "seed": 1, "layers": [4, 2, 2],
151 |      "stepSize": 0.03, "tol": 0.000001, "blockSize": 128, "solver": "l-bfgs"}
152 | 
153 |     # maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs"
154 |     label_index = condition['label']  # 标签列(列名或列号)
155 |     feature_indexs = condition['features']  # 特征列(列名或列号)
156 |     iterations = condition['iterations']  # 最大迭代次数(默认100)
157 |     tol = condition['tol']  # 迭代算法的收敛容限（> = 0）（默认值：1e-06即 0.000001）
158 |     seed = condition['seed']  # 随机种子
159 |     layers = condition['layers']  # Sizes of layers from input layer to output layer
160 |     blockSize = condition['blockSize']  # Block size for stacking input data in matrices.
161 |     stepSize = condition['stepSize']  # 步长，默认值：0.03
162 |     solver = condition['solver']  # 是否训练截距项（默认值："l-bfgs","gd"可选)
163 | 
164 |     # 参数类型转换
165 |     if isinstance(iterations, str):
166 |         iterations = int(iterations)
167 |     if isinstance(tol, str):
168 |         tol = float(tol)
169 |     if isinstance(seed, str):
170 |         seed = int(seed)
171 |     if isinstance(layers, list):
172 |         for i in range(len(layers)):
173 |             if isinstance(layers[i], str):
174 |                 layers[i] = int(layers[i])
175 |     if isinstance(blockSize, str):
176 |         blockSize = int(blockSize)
177 |     if isinstance(stepSize, str):
178 |         stepSize = float(stepSize)
179 | 
180 |     # 1. 准备数据
181 |     def func(x):
182 |         features_data = []
183 |         for feature in feature_indexs:
184 |             features_data.append(x[feature])
185 |         return Row(label=x[label_index], features=Vectors.dense(features_data))
186 | 
187 |     training_set = df.rdd.map(lambda x: func(x)).toDF()
188 | 
189 |     # 2.训练模型
190 |     mpc_param = MultilayerPerceptronClassifier(maxIter=iterations, tol=tol, seed=seed, layers=layers,
191 |                                                blockSize=blockSize, stepSize=stepSize, solver=solver)
192 |     mpc_model = mpc_param.fit(training_set)
193 | 
194 |     # 3.保存模型
195 |     mpc_model_path = model_url() + '/mpc/' + str(uuid.uuid1())
196 |     deltree(mpc_model_path)  # 删除已经存在的模型
197 |     mpc_model.write().overwrite().save(mpc_model_path)
198 | 
199 |     return mpc_model_path
200 | 


--------------------------------------------------------------------------------
/app/service/ml/PredictService.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | """
  3 | 二分类
  4 | """
  5 | from pyspark.mllib.classification import SVMModel
  6 | from pyspark.mllib.regression import LabeledPoint
  7 | import app.dao.OperatorDao as OperatorDao
  8 | from app.Utils import *
  9 | from pyspark.ml.linalg import Vectors
 10 | from pyspark.sql.types import Row
 11 | from pyspark.ml.classification import GBTClassificationModel, LogisticRegressionModel, \
 12 |     MultilayerPerceptronClassificationModel
 13 | 
 14 | 
 15 | def ml_predict(spark_session, operator_id, file_urls, condition):
 16 |     """
 17 |     机器学习模型预测函数
 18 |     :param spark_session:
 19 |     :param operator_id:
 20 |     :param file_urls: ["modelUrl","predictDataUrl"]
 21 |     # 两个输入源 一个是模型 一个是预测数据
 22 |     :param condition:
 23 |     :return:
 24 |     """
 25 |     try:
 26 |         # 修改计算状态
 27 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
 28 |         # 读取数据
 29 |         for url in file_urls:
 30 |             print("------fileUrl:", file_urls)
 31 |             if url[-4:] == ".csv":
 32 |                 url1 = url
 33 |             else:
 34 |                 url0 = url
 35 |         df = read_data(spark_session, url1)
 36 |         # 预测函数
 37 |         result_df = ml_predict_core(spark_session, operator_id, df, url0, condition)
 38 |         if isinstance(result_df, str):
 39 |             OperatorDao.update_operator_by_id(operator_id, 'error', '', result_df)
 40 |         else:
 41 |             # 存储结果
 42 |             result_df.show()
 43 |             result_file_url = save_data(result_df)
 44 |             run_info = '预测算子执行成功'
 45 |             # 修改计算状态
 46 |             OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info)
 47 |             return [result_file_url]
 48 | 
 49 |     except Exception as e:
 50 |         run_info = str(e)
 51 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
 52 |         traceback.print_exc()
 53 |     return []
 54 | 
 55 | 
 56 | def ml_predict_core(spark_session, operator_id, df, model_url, condition):
 57 |     """
 58 |     路由控制加载哪种模型进行预测
 59 |     :param spark_session:
 60 |     :param operator_id:
 61 |     :param df:
 62 |     :param model_url:
 63 |     :param condition:
 64 |     :return:  预测结果 sparkframe
 65 |     """
 66 | 
 67 |     # 父节点是什么组件
 68 |     operator = OperatorDao.get_operator_by_id(operator_id)
 69 |     father_ids = operator.father_operator_ids.split(',')
 70 |     print("**********", operator.father_operator_ids)
 71 |     for father_id in father_ids:
 72 |         father = OperatorDao.get_operator_by_id(father_id)
 73 |         print("***************", father.operator_type_id)
 74 |         print("---------------", father.operator_type_id == 6001)
 75 |         operator_type_flag = father.operator_type_id
 76 | 
 77 |         # 模型加载节点
 78 |         if operator_type_flag == 8000:
 79 |             operator_type_flag = json.loads(father.operator_config)['parameter']['modelTypeId']
 80 | 
 81 |         if operator_type_flag == 6001:  # svm二分类
 82 |             prediction_df = svm_second_predict(spark_session, model_url, df, condition)
 83 |         elif operator_type_flag == 6002:  # gbdt二分类
 84 |             prediction_df = gbdt_second_predict(model_url, df, condition)
 85 |         elif operator_type_flag == 6003:  # lr二分类
 86 |             prediction_df = lr_second_predict(model_url, df, condition)
 87 |         elif operator_type_flag == 6004:  # lr多分类
 88 |             prediction_df = lr_multiple_predict(model_url, df, condition)
 89 |         elif operator_type_flag == 6005:  # mpc多分类
 90 |             prediction_df = mpc_multiple_predict(model_url, df, condition)
 91 | 
 92 |     # 根据父组件的类型决定加载哪种模型
 93 |     return prediction_df
 94 | 
 95 | 
 96 | def svm_second_predict(spark_session, svm_model_path, df, condition):
 97 |     """
 98 |     支持向量机二分类预测
 99 |     :param spark_session: spark 会话
100 |     :param svm_model_path: 模型地址
101 |     :param df: 数据
102 |     :param condition: {"features": [12, 13, 14, 15], "label": "label"}
103 |     特征列
104 |     :return: 预测结果 sparkframe
105 |     """
106 |     feature_indexs = condition['features']
107 |     label_index = condition['label']
108 |     if label_index is None or label_index == "":  # 无标签列
109 |         # 1. 准备数据
110 |         def func(x):
111 |             features_data = []
112 |             for feature in feature_indexs:
113 |                 features_data.append(x[feature])
114 |             return features_data
115 | 
116 |         predict_data = df.rdd.map(lambda x: func(x))
117 |         print(predict_data.take(10))
118 | 
119 |         # 2.加载模型
120 |         svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)
121 | 
122 |         # 3.预测
123 |         def f(x):
124 |             return {"prediction_result": x}
125 | 
126 |         prediction_rdd = svm_model.predict(predict_data)
127 |         print(prediction_rdd.take(10))
128 |         prediction_df = prediction_rdd.map(lambda x: Row(**f(x))).toDF()
129 |         return prediction_df
130 |     else:  # 有标签列
131 |         # 1. 准备数据
132 |         def func(x):
133 |             features_data = []
134 |             for feature in feature_indexs:
135 |                 features_data.append(x[feature])
136 |             return LabeledPoint(label=x[label_index], features=features_data)
137 | 
138 |         predict_label_data = df.rdd.map(lambda x: func(x))
139 |         print(predict_label_data.take(10))
140 | 
141 |         # 2.加载模型
142 |         svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)
143 | 
144 |         # 3.预测
145 |         from pyspark.sql.types import Row
146 | 
147 |         def f(x):
148 |             return {"prediction_result": x[0], label_index: x[1]}
149 | 
150 |         prediction_rdd = predict_label_data.map(lambda x: (svm_model.predict(x.features), x.label))
151 |         print(prediction_rdd.take(10))
152 |         prediction_df = prediction_rdd.map(lambda x: Row(**f(x))).toDF()
153 |         return prediction_df
154 | 
155 | 
156 | def gbdt_second_predict(gbdt_model_path, df, condition):
157 |     """
158 |     gbdt二分类预测
159 |     :param gbdt_model_path: 模型地址
160 |     :param df: 数据
161 |     :param condition: {"features": [12, 13, 14, 15], "label": "label"}
162 |     特征列
163 |     :return: 预测结果 sparkframe
164 |     """
165 |     feature_indexs = condition['features']
166 |     label_index = condition['label']
167 | 
168 |     if label_index is None or label_index == "":  # 无标签列
169 |         # 1. 准备数据
170 |         def func(x):
171 |             features_data = []
172 |             for feature in feature_indexs:
173 |                 features_data.append(x[feature])
174 |             return Row(features=Vectors.dense(features_data))
175 | 
176 |         training_set = df.rdd.map(lambda x: func(x)).toDF()
177 | 
178 |         # 2.加载模型
179 |         gbdt_model = GBTClassificationModel.load(gbdt_model_path)
180 | 
181 |         # 3.预测
182 |         prediction_df = gbdt_model.transform(training_set).select("prediction", "features")
183 |         return prediction_df
184 |     else:  # 有标签列
185 |         # 1. 准备数据
186 |         def func(x):
187 |             features_data = []
188 |             for feature in feature_indexs:
189 |                 features_data.append(x[feature])
190 |             return Row(label=x[label_index], features=Vectors.dense(features_data))
191 | 
192 |         training_set = df.rdd.map(lambda x: func(x)).toDF()
193 | 
194 |         # 2.加载模型
195 |         print("****gbdt_model_path:", gbdt_model_path)
196 |         gbdt_model = GBTClassificationModel.load(gbdt_model_path)
197 | 
198 |         # 3.预测
199 |         prediction_df = gbdt_model.transform(training_set).select("prediction", "label", "features")
200 |         return prediction_df
201 | 
202 | 
203 | def lr_second_predict(lr_model_path, df, condition):
204 |     """
205 |     lr二分类预测
206 |     :param lr_model_path: 模型地址
207 |     :param df: 数据
208 |     :param condition: {"features": [12, 13, 14, 15], "label": "label"}
209 |     特征列
210 |     :return: 预测结果 spark dataframe
211 |     """
212 |     feature_indexs = condition['features']
213 |     label_index = condition['label']
214 | 
215 |     if label_index is None or label_index == "":  # 无标签列
216 |         # 1. 准备数据
217 |         def func(x):
218 |             features_data = []
219 |             for feature in feature_indexs:
220 |                 features_data.append(x[feature])
221 |             return Row(features=Vectors.dense(features_data))
222 | 
223 |         training_set = df.rdd.map(lambda x: func(x)).toDF()
224 | 
225 |         # 2.加载模型
226 |         lr_model = LogisticRegressionModel.load(lr_model_path)
227 | 
228 |         # 3.预测
229 |         prediction_df = lr_model.transform(training_set).select("prediction", "features")
230 |         return prediction_df
231 |     else:  # 有标签列
232 |         # 1. 准备数据
233 |         def func(x):
234 |             features_data = []
235 |             for feature in feature_indexs:
236 |                 features_data.append(x[feature])
237 |             return Row(label=x[label_index], features=Vectors.dense(features_data))
238 | 
239 |         training_set = df.rdd.map(lambda x: func(x)).toDF()
240 | 
241 |         # 2.加载模型
242 |         print("*****lr_model_path:", lr_model_path)
243 |         lr_model = LogisticRegressionModel.load(lr_model_path)
244 | 
245 |         # 3.预测
246 |         prediction_df = lr_model.transform(training_set).select("prediction", "label", "features")
247 |         return prediction_df
248 | 
249 | 
250 | """ 多分类 """
251 | 
252 | 
253 | def lr_multiple_predict(lr_model_path, df, condition):
254 |     """
255 |     lr多分类预测
256 |     :param lr_model_path: 模型地址
257 |     :param df: 数据
258 |     :param condition: {"features": [12, 13, 14, 15], "label": "label"}
259 |     特征列
260 |     :return: 预测结果 sparkframe
261 |     """
262 |     return lr_second_predict(lr_model_path, df, condition)
263 | 
264 | 
265 | def mpc_multiple_predict(mpc_model_path, df, condition):
266 |     """
267 |     mpc多分类预测
268 |     :param mpc_model_path: 模型地址
269 |     :param df: 数据
270 |     :param condition: {"features": [12, 13, 14, 15], "label": "label"}
271 |     特征列
272 |     :return: 预测结果 sparkframe
273 |     """
274 |     feature_indexs = condition['features']
275 |     label_index = condition['label']
276 | 
277 |     if label_index is None or label_index == "":  # 无标签列
278 |         # 1. 准备数据
279 |         def func(x):
280 |             features_data = []
281 |             for feature in feature_indexs:
282 |                 features_data.append(x[feature])
283 |             return Row(features=Vectors.dense(features_data))
284 | 
285 |         training_set = df.rdd.map(lambda x: func(x)).toDF()
286 | 
287 |         # 2.加载模型
288 |         mpc_model = MultilayerPerceptronClassificationModel.load(mpc_model_path)
289 | 
290 |         # 3.预测
291 |         prediction_df = mpc_model.transform(training_set).select("prediction", "features")
292 |         return prediction_df
293 |     else:  # 有标签列
294 |         # 1. 准备数据
295 |         def func(x):
296 |             features_data = []
297 |             for feature in feature_indexs:
298 |                 features_data.append(x[feature])
299 |             return Row(label=x[label_index], features=Vectors.dense(features_data))
300 | 
301 |         training_set = df.rdd.map(lambda x: func(x)).toDF()
302 | 
303 |         # 2.加载模型
304 |         print("*****mpc_model_path:", mpc_model_path)
305 |         mpc_model = MultilayerPerceptronClassificationModel.load(mpc_model_path)
306 | 
307 |         # 3.预测
308 |         prediction_df = mpc_model.transform(training_set).select("prediction", "label", "features")
309 |         return prediction_df
310 | 


--------------------------------------------------------------------------------
/app/service/ml/SecondClassification.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | """
  3 | 二分类
  4 | """
  5 | from pyspark.mllib.classification import SVMWithSGD
  6 | from pyspark.mllib.regression import LabeledPoint
  7 | import app.dao.OperatorDao as OperatorDao
  8 | from app.Utils import *
  9 | 
 10 | from pyspark.ml.linalg import Vectors
 11 | from pyspark.ml.classification import GBTClassifier, LogisticRegression
 12 | from pyspark.ml.feature import StringIndexer
 13 | from pyspark.sql.types import Row
 14 | 
 15 | 
 16 | def model_url():
 17 |     """
 18 |     二分类模型保存地址
 19 |     :return:
 20 |     """
 21 |     return const.MIDDATA + 'model/secondClassification'
 22 | 
 23 | 
 24 | def svm(spark_session, operator_id, file_url, condition):
 25 |     """
 26 |     支持向量机二分类
 27 |     :param spark_session:
 28 |     :param operator_id:
 29 |     :param file_url:
 30 |     :param condition:
 31 |     :return:
 32 |     """
 33 |     try:
 34 |         # 修改计算状态
 35 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
 36 |         # 读取数据
 37 |         df = read_data(spark_session, file_url)
 38 |         # svm_core函数
 39 |         result_model_url = svm_core(spark_session, df, condition)
 40 |         # 修改计算状态
 41 |         run_info = '支持向量机二分类算子执行成功'
 42 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_model_url, run_info)
 43 |         return [result_model_url]
 44 | 
 45 |     except Exception as e:
 46 |         run_info = str(e)
 47 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
 48 |         traceback.print_exc()
 49 |     return []
 50 | 
 51 | 
 52 | def svm_core(spark_session, df, condition):
 53 |     """
 54 |     支持向量机二分类核心函数
 55 |     :param spark_session:
 56 |     :param df:
 57 |     :param condition:
 58 |     {"label": "", "features": [12, 13, 14, 15], "iterations": 20, "step": 1.0, "regParam": 0.01, "regType": "l2", "convergenceTol": 0.001}
 59 |     :return:
 60 |     """
 61 | 
 62 |     # 参数
 63 |     label_index = condition['label']  # 标签列(列名或列号)
 64 |     feature_indexs = condition['features']  # 特征列(列名或列号)
 65 |     iterations = condition['iterations']  # 迭代轮数
 66 |     step = condition['step']  # 步长
 67 |     reg_param = condition['regParam']  # 正则化系数
 68 |     reg_type = condition['regType']  # 正则化
 69 |     convergence_tol = condition['convergenceTol']  # 收敛系数
 70 | 
 71 |     # 1. 准备数据
 72 |     def func(x):
 73 |         features_data = []
 74 |         for feature in feature_indexs:
 75 |             features_data.append(x[feature])
 76 |         return LabeledPoint(label=x[label_index], features=features_data)
 77 | 
 78 |     training_data = df.rdd.map(lambda x: func(x))
 79 | 
 80 |     # 2. 训练
 81 |     svm_model = SVMWithSGD.train(training_data, iterations=iterations, step=step, regParam=reg_param,
 82 |                                  miniBatchFraction=1.0, initialWeights=None, regType=reg_type,
 83 |                                  intercept=False, validateData=True, convergenceTol=convergence_tol)
 84 | 
 85 |     # 3.保存模型
 86 |     svm_model_path = model_url() + '/svm/' + str(uuid.uuid1())
 87 |     deltree(svm_model_path)  # 删除已经存在的模型
 88 |     svm_model.save(spark_session.sparkContext, svm_model_path)
 89 | 
 90 |     return svm_model_path
 91 | 
 92 | 
 93 | def gbdt(spark_session, operator_id, file_url, condition):
 94 |     """
 95 |     # GBDT(Gradient Boosting Decision Tree) 又叫 MART（Multiple Additive Regression Tree)，是一种迭代的决策树算法，
 96 |     # 该算法由多棵决策树组成，所有树的结论累加起来做最终答案。
 97 |     :param spark_session:
 98 |     :param operator_id:
 99 |     :param file_url:
100 |     :param condition:
101 |     :return:
102 |     """
103 |     try:
104 |         # 修改计算状态
105 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
106 |         # 读取数据
107 |         df = read_data(spark_session, file_url)
108 |         # svm_core函数
109 |         result_model_url = gbdt_core(df, condition)
110 |         # 修改计算状态
111 |         run_info = 'GBDT二分类算子执行成功'
112 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_model_url, run_info)
113 |         return [result_model_url]
114 | 
115 |     except Exception as e:
116 |         run_info = str(e)
117 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
118 |         traceback.print_exc()
119 |     return []
120 | 
121 | 
122 | def gbdt_core(df, condition):
123 |     """
124 |     gdbt二分类核心函数
125 |     :param spark_session:
126 |     :param df:
127 |     :param condition:{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 0.1, "maxDepth": 5, "minInstancesPerNode": 1, "seed": 1}
128 |     :return:
129 |     """
130 | 
131 |     # 参数
132 |     label_index = condition['label']  # 标签列(列名或列号)
133 |     feature_indexs = condition['features']  # 特征列(列名或列号)
134 |     iterations = condition['iterations']  # 迭代次数
135 |     step = condition['step']  # 学习速率(0-1)
136 |     max_depth = condition['maxDepth']  # 数的最大深度[1,100]
137 |     minInstancesPerNode = condition['minInstancesPerNode']  # 叶子节点最少样本数[1,1000]
138 |     seed = condition['seed']  # 随机数产生器种子[0,10]
139 | 
140 |     # 1. 准备数据
141 |     def func(x):
142 |         features_data = []
143 |         for feature in feature_indexs:
144 |             features_data.append(x[feature])
145 |         return Row(label=x[label_index], features=Vectors.dense(features_data))
146 | 
147 |     training_set = df.rdd.map(lambda x: func(x)).toDF()
148 | 
149 |     string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
150 |     si_model = string_indexer.fit(training_set)
151 |     tf = si_model.transform(training_set)
152 | 
153 |     # 2. 训练
154 |     gbdt = GBTClassifier(labelCol="indexed",
155 |                          maxIter=iterations, stepSize=step, maxDepth=max_depth, minInstancesPerNode=minInstancesPerNode,
156 |                          seed=seed)
157 |     gbdt_model = gbdt.fit(tf)
158 |     print(gbdt_model.featureImportances)
159 | 
160 |     # 3.保存模型
161 |     svm_model_path = model_url() + '/gbdt/' + str(uuid.uuid1())
162 |     deltree(svm_model_path)  # 删除已经存在的模型
163 |     gbdt_model.write().overwrite().save(svm_model_path)
164 | 
165 |     return svm_model_path
166 | 
167 | 
168 | def lr(spark_session, operator_id, file_url, condition):
169 |     """
170 |     逻辑回归二分类
171 |     :param spark_session:
172 |     :param operator_id:
173 |     :param file_url:
174 |     :param condition:
175 |     :return:
176 |     """
177 |     try:
178 |         # 修改计算状态
179 |         OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
180 |         # 读取数据
181 |         df = read_data(spark_session, file_url)
182 |         # svm_core函数
183 |         result_model_url = lr_core(df, condition)
184 |         # 修改计算状态
185 |         run_info = '逻辑回归二分类算子执行成功'
186 |         OperatorDao.update_operator_by_id(operator_id, 'success', result_model_url, run_info)
187 |         return [result_model_url]
188 | 
189 |     except Exception as e:
190 |         run_info = str(e)
191 |         OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
192 |         traceback.print_exc()
193 |     return []
194 | 
195 | 
196 | def lr_core(df, condition):
197 |     """
198 |     lr二分类核心函数
199 |     :param spark_session:
200 |     :param df:
201 |     :param condition:{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20,"regParam":0.0,"elasticNetParam":0.0,"tol":0.000006,"fitIntercept":True,"threshold":0.5}
202 |     :return:
203 |     """
204 |     # 参数
205 |     label_index = condition['label']  # 标签列(列名或列号)
206 |     feature_indexs = condition['features']  # 特征列(列名或列号)
207 |     iterations = condition['iterations']  # 最大迭代次数(默认100)
208 |     regParam = condition['regParam']  # 正则化参数（默认0.0）
209 |     # ElasticNet混合参数，范围为[0，1]。对于alpha = 0，惩罚是L2惩罚。对于alpha = 1，这是L1惩罚（默认值：0.0)
210 |     elasticNetParam = condition['elasticNetParam']
211 |     tol = condition['tol']  # 迭代算法的收敛容限（> = 0）（默认值：1e-06即 0.000006）
212 |     fitIntercept = condition['fitIntercept']  # 是否训练截距项（默认值："True","False"可选)
213 |     threshold = condition['threshold']  # 二进制分类预测中的阈值，范围为[0，1]（默认值：0.5）
214 | 
215 |     # 参数类型转换
216 |     if isinstance(iterations, str):
217 |         iterations = int(iterations)
218 |     if isinstance(regParam, str):
219 |         regParam = float(regParam)
220 |     if isinstance(elasticNetParam, str):
221 |         elasticNetParam = float(elasticNetParam)
222 |     if isinstance(tol, str):
223 |         tol = float(tol)
224 |     if isinstance(fitIntercept, str):
225 |         if fitIntercept == 'False':
226 |             fitIntercept = False
227 |         else:
228 |             fitIntercept = True
229 |     if isinstance(threshold, str):
230 |         threshold = float(threshold)
231 | 
232 |     # 1. 准备数据
233 |     def func(x):
234 |         features_data = []
235 |         for feature in feature_indexs:
236 |             features_data.append(x[feature])
237 |         return Row(label=x[label_index], features=Vectors.dense(features_data))
238 | 
239 |     training_set = df.rdd.map(lambda x: func(x)).toDF()
240 | 
241 |     # 2.训练模型
242 |     lr_param = LogisticRegression(featuresCol="features", labelCol="label", predictionCol="prediction",
243 |                                   maxIter=iterations, regParam=regParam, elasticNetParam=elasticNetParam, tol=tol,
244 |                                   fitIntercept=fitIntercept, threshold=threshold, probabilityCol="probability",
245 |                                   rawPredictionCol="rawPrediction", standardization=True,
246 |                                   aggregationDepth=2, family="auto")
247 |     lr_model = lr_param.fit(training_set)
248 |     print(lr_model.coefficients)  # 系数
249 |     print(lr_model.intercept)  # 截距
250 |     print(lr_model.explainParams())  # 参数以及其注解
251 | 
252 |     # 3.保存模型
253 |     lr_model_path = model_url() + '/lr/' + str(uuid.uuid1())
254 |     deltree(lr_model_path)  # 删除已经存在的模型
255 |     lr_model.write().overwrite().save(lr_model_path)
256 | 
257 |     return lr_model_path
258 | 
259 | 
260 | '''
261 | 错误 'PipelinedRDD' object has no attribute 'show'
262 | 报这个错，是因为 df.show() is only for spark DataFrame 所致。
263 | '''
264 | 


--------------------------------------------------------------------------------
/app/service/ml/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | 机器学习Service
3 | """


--------------------------------------------------------------------------------
/app/test/FPGrowthTest.py:
--------------------------------------------------------------------------------
 1 | from pyspark import SparkContext
 2 | sc = SparkContext("local","testing")
 3 | from  pyspark.mllib.fpm import FPGrowth
 4 | data = [["A", "B", "C", "E", "F","O"], ["A", "C", "G"], ["E","I"], ["A", "C","D","E","G"], ["A", "C", "E","G","L"],
 5 |        ["E","J"],["A","B","C","E","F","P"],["A","C","D"],["A","C","E","G","M"],["A","C","E","G","N"]]
 6 | # 转换成RDD 参数numSlices指定了将数据集切分为几份，这里不设置，Spark会尝试根据集群的状况，来自动设定slices的数目
 7 | rdd = sc.parallelize(data)
 8 | #支持度阈值为20%
 9 | model = FPGrowth.train(rdd, 0.3, 2)
10 | print(sorted(model.freqItemsets().collect()))
11 | 
12 | 
13 | from  pyspark.mllib.fpm import PrefixSpan
14 | data = [
15 |    [['a'],["a", "b", "c"], ["a","c"],["d"],["c", "f"]],
16 |    [["a","d"], ["c"],["b", "c"], ["a", "e"]],
17 |    [["e", "f"], ["a", "b"], ["d","f"],["c"],["b"]],
18 |    [["e"], ["g"],["a", "f"],["c"],["b"],["c"]]
19 |    ]
20 | rdd = sc.parallelize(data)
21 | model = PrefixSpan.train(rdd, 0.5,4)
22 | print(sorted(model.freqItemsets().collect()))
23 | 


--------------------------------------------------------------------------------
/app/test/PySparkTest.py:
--------------------------------------------------------------------------------
 1 | from pyspark.sql import SparkSession
 2 | import pyspark.rdd
 3 | 
 4 | APP_NAME = "Random Forest Example2"
 5 | SPARK_URL = "spark://10.108.211.130:7077"
 6 | spark = SparkSession.builder \
 7 |         .appName(APP_NAME) \
 8 |         .master(SPARK_URL) \
 9 |         .getOrCreate()
10 | # data = spark.sparkContext.parallelize([('Ferrari','fast'),{'Porsche':10000},['Spain','visited',4504]]).collect()
11 | # arr = []
12 | # arr.append(data)
13 | # print(data[1]['Porsche'])
14 | # print(data[2][1])
15 | #
16 | # df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
17 | # df = spark.createDataFrame([(1, "John Doe", 22)], ("id", "name", "age"))
18 | # df.show()
19 | # arr2 = []
20 | # arr2.append(df)
21 | # for d in arr2:
22 | #     d.show()
23 | data = spark.sparkContext.textFile("hdfs://10.108.211.130/user/yufeng/files/spam.txt")
24 | data2 = data.map(lambda x: len(x))
25 | sum = data2.fold(0, (lambda x, y: x + y))
26 | 
27 | sum
28 | 


--------------------------------------------------------------------------------
/app/test/RandomForestTest.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Random Forest Classification Example.
 3 | """
 4 | from pyspark import SparkContext
 5 | from pyspark.sql import SparkSession
 6 | 
 7 | if __name__ == "__main__":
 8 | 
 9 |     CSV_PATH = "/home/zk/data/creditcard.csv"
10 |     APP_NAME = "Random Forest Example"
11 |     SPARK_URL = "local[*]"
12 |     RANDOM_SEED = 13579
13 |     TRAINING_DATA_RATIO = 0.7
14 |     RF_NUM_TREES = 3
15 |     RF_MAX_DEPTH = 4
16 |     RF_MAX_BINS = 32
17 | 
18 |     spark = SparkSession.builder \
19 |         .appName(APP_NAME) \
20 |         .master(SPARK_URL) \
21 |         .getOrCreate()
22 | 
23 |     df = spark.read \
24 |         .options(header="true", inferschema="true") \
25 |         .csv(CSV_PATH)
26 | 
27 |     print("Total number of rows: %d" % df.count())
28 | 
29 |     from pyspark.mllib.linalg import Vectors
30 |     from pyspark.mllib.regression import LabeledPoint
31 | 
32 |     transformed_df = df.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))
33 | 
34 |     splits = [TRAINING_DATA_RATIO, 1.0 - TRAINING_DATA_RATIO]
35 |     training_data, test_data = transformed_df.randomSplit(splits, RANDOM_SEED)
36 | 
37 |     print("Number of training set rows: %d" % training_data.count())
38 |     print("Number of test set rows: %d" % test_data.count())
39 | 
40 |     from pyspark.mllib.tree import RandomForest
41 |     from time import *
42 |     import shutil, os
43 | 
44 |     start_time = time()
45 | 
46 |     model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, \
47 |                                          numTrees=RF_NUM_TREES, featureSubsetStrategy="auto", impurity="gini", \
48 |                                          maxDepth=RF_MAX_DEPTH, maxBins=RF_MAX_BINS, seed=RANDOM_SEED)
49 |     if os.path.exists("myRandomForestClassificationModel"):
50 |         shutil.rmtree("myRandomForestClassificationModel")
51 |     model.save(spark.sparkContext, "myRandomForestClassificationModel")
52 | 
53 |     print('Learned classification forest model:')
54 |     print(model.numTrees())
55 |     print(model.totalNumNodes())
56 |     print(model.toDebugString())
57 |     end_time = time()
58 |     elapsed_time = end_time - start_time
59 |     print("Time to train model: %.3f seconds" % elapsed_time)
60 | 
61 |     predictions = model.predict(test_data.map(lambda x: x.features))
62 |     labels_and_predictions = test_data.map(lambda x: x.label).zip(predictions)
63 |     acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(test_data.count())
64 |     print("Model accuracy: %.3f%%" % (acc * 100))
65 | 
66 |     from pyspark.mllib.evaluation import BinaryClassificationMetrics
67 | 
68 |     start_time = time()
69 | 
70 |     metrics = BinaryClassificationMetrics(labels_and_predictions)
71 |     print("Area under Precision/Recall (PR) curve: %.f" % (metrics.areaUnderPR * 100))
72 |     print("Area under Receiver Operating Characteristic (ROC) curve: %.3f" % (metrics.areaUnderROC * 100))
73 | 
74 |     end_time = time()
75 |     elapsed_time = end_time - start_time
76 |     print("Time to evaluate model: %.3f seconds" % elapsed_time)
77 | 


--------------------------------------------------------------------------------
/app/test/Test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | from app.Utils import getProjectCurrentDataUrl
 3 | import pandas as pd
 4 | 
 5 | def fullTableStatistics2():
 6 |     # columnNames = request.form.getlist("columns")
 7 |     # projectName = request.form.getlist("projectName")
 8 |     columnNames = [ "行 ID",
 9 |                     "订单 ID",
10 |                     "订购日期",
11 |                     "装运日期",
12 |                     "装运方式",
13 |                     "客户 ID",
14 |                     "客户名称",
15 |                     "细分市场",
16 |                     "邮政编码 (Postal Code)",
17 |                     "城市 (City)",
18 |                     "省/市/自治区 (State/Province)",
19 |                     "国家/地区 (Country)",
20 |                     "地区",
21 |                     "市场",
22 |                     "产品 ID",
23 |                     "类别",
24 |                     "子类别",
25 |                     "产品名称",
26 |                     "销售额",
27 |                     "数量",
28 |                     "折扣",
29 |                     "利润",
30 |                     "装运成本",
31 |                     "订单优先级"]
32 |     projectName = "爱德信息分析项目"
33 |     fileUrl = getProjectCurrentDataUrl(projectName)
34 |     if fileUrl[-4:] == ".csv":
35 |         df_excel = pd.read_csv(fileUrl, encoding="utf-8")
36 |     else:
37 |         df_excel = pd.read_excel(fileUrl, encoding="utf-8")
38 |     res = []
39 |     statistics = ['字段名','类型','总数','最小值','最小值位置','25%分位数','中位数','75%分位数','均值','最大值','最大值位置','平均绝对偏差','方差','标准差','偏度','峰度']
40 |     for columnName in columnNames:
41 |         info = {}.fromkeys(statistics)
42 |         info['字段名'] = columnName
43 |         info['类型'] = df_excel[columnName].dtype
44 |         if info['类型'] == 'int64' or info['类型'] == 'float64':
45 |             info['总数'] = df_excel[columnName].count()
46 |             info['最小值'] = df_excel[columnName].min()
47 |             info['最小值位置'] = df_excel[columnName].idxmin()
48 |             info['25%分位数'] = df_excel[columnName].quantile(.25)
49 |             info['中位数'] = df_excel[columnName].median()
50 |             info['75%分位数'] = df_excel[columnName].quantile(.75)
51 |             info['均值'] = df_excel[columnName].mean()
52 |             info['最大值'] = df_excel[columnName].max()
53 |             info['最大值位置'] = df_excel[columnName].idxmax()
54 |             info['平均绝对偏差'] = df_excel[columnName].mad()
55 |             info['方差'] = df_excel[columnName].var()
56 |             info['标准差'] = df_excel[columnName].std()
57 |             info['偏度'] = df_excel[columnName].skew()
58 |             info['峰度'] = df_excel[columnName].kurt()
59 |         else:
60 |             info['类型'] = "text"
61 |         res.append(info)
62 |     # print(res)
63 | 
64 | # fullTableStatistics2()
65 | 
66 | str1 = "利润,+,100,+,数量,*,0.0001,newCol"
67 | print(len(str1.split(';'))) #1
68 | str1 = "利润,均值填充;数量,最大值填充"
69 | print(len(str1.split(';'))) #2
70 | str1 = "利润,均值填充;数量,最大值填充;"
71 | print(len(str1.split(';'))) #3


--------------------------------------------------------------------------------
/app/test/zhoukang:
--------------------------------------------------------------------------------
 1 | 
 2 | #数据源 过滤 排序
 3 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"data2" :{"type" : "data","name" : 5001,"location" : {"x":"12px", "y":"23px"},"config" : {"fileId" : 2,"fileUrl" : [{"data2":"/home/zk/data/订单信息.csv"}]},"next" : ["pre1"],"pre" : []},"exp1" : {"type" : "exploration","name" : 1001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId":"1","projectId":32,"parameter":[{"colName":"利润", "operate":">", "value":"100", "relation":"AND"},{"colName":"装运方式", "operate":"==", "value":"一级", "relation":""}]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]},"pre1" : {"type" : "preprocess","name" : 1002,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter":{"userId": "1", "projectId": 32, "columnName": "利润", "sortType": "升序"},"fileUrl":[{"data2":0}]},"next" : [],"pre" : ["data2"]}}
 4 | 
 5 | # 排序
 6 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1002,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId":"1","projectId":32,"parameter":{"userId":1,"projectId":32,"columnName":"利润","sortType":"降序"}},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
 7 | 
 8 | #数据列拆分
 9 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1003,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId": 1, "projectId": 32, "columnName": "订购日期", "delimiter": "/", "newColumnNames": ["year", "月"]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
10 | 
11 | # 数据列合并
12 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1005,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId": 1, "projectId": 32, "columnNames": ["类别", "子类别", "产品名称"], "connector": "-", "newColumnName": "品类名称"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
13 | 
14 | # 替换
15 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1006,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId": 1, "projectId": 32, "columnNames": ["类别", "子类别", "客户名称"],"replaceCharacters":[{"source":"技术","target":"技术copy"},{"source":"电话","target":"电话copy"}]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
16 | 
17 | #填充空值
18 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1007,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId":1,"projectId":32,"parameter":[{"operate":"均值填充","colName":"利润"},{"operate":"均值填充","colName":"数量"}]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
19 | 
20 | # 列映射
21 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1008,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"parameter":[{"colName_1":"利润", "operate_1":"+","value_1":"100","operate":"+","colName_2":"数量", "operate_2":"*","value_2":"0.0001","newName":"newCol1"},{"colName_1":"利润", "operate_1":"+","value_1":"10","operate":"*","colName_2":"数量", "operate_2":"*","value_2":"0.1","newName":"newCol2"}]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
22 | 
23 | # 分位数离散化
24 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"columnName":"装运成本","newColumnName":"装运成本(分位数离散化)","numBuckets":10},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
25 | 
26 | # 向量索引
27 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2002,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"columnNames":["装运成本"],"newColumnName":"向量索引转换结果","maxCategories":50},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
28 | 
29 | # 标准化
30 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2003,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"projectId":32,"columnNames":["利润"],"newColumnName":"利润(标准化)"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
31 | 
32 | # pca
33 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2004,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"columnNames":["销售额","数量","折扣","利润","装运成本"],"newColumnName":"降维结果","k":4},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
34 | 
35 | # 字符串转标签
36 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2005,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"columnName":"客户名称","newColumnName":"客户名称(标签化，按频率排序，0为频次最高)"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
37 | 
38 | # 独热编码
39 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2006,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"columnNames":["数量","数量"],"newColumnNames":["独热编码1","独热编码2"]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
40 | 
41 | # 多项式扩展
42 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2007,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"projectId":32,"columnNames":["数量","折扣","装运成本"],"newColumnName":"多项式扩展"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
43 | 
44 | # 卡方选择
45 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2008,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":"1","projectId":"订单分析","columnNames":["折扣","装运成本"],"columnName_label":"数量","newColumnName":"卡方选择","numTopFeatures":2},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
46 | 
47 | # 全表统计
48 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 3001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"projectId": 32, "columnNames": ["利润"]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
49 | 
50 | # 频率统计
51 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 3002,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"projectId":32,"columnName":"类别"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
52 | 
53 | # 相关系数
54 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 3003,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"projectId": 32, "columnNames": ["销售额", "折扣", "装运成本"]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
55 | 
56 | # 支持向量机二分类
57 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 6001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 1.0, "regParam": 0.01, "regType": "l2", "convergenceTol": 0.001},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
58 | 
59 | # gbdt二分类
60 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 6002,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 0.1, "maxDepth": 5, "minInstancesPerNode": 1, "seed": 1},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
61 | 
62 | # 逻辑回归二分类
63 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 6003,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20,"regParam":0.0,"elasticNetParam":0.0,"tol":0.000006,"fitIntercept":"True","threshold":0.5},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
64 | 
65 | # 逻辑回归多分类
66 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 6004,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20,"regParam":0.0,"elasticNetParam":0.0,"tol":0.000006,"fitIntercept":"True"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
67 | 
68 | # 多层感知机多分类
69 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 6005,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "seed": 1, "stepSize": 0.03,"layers": [4, 2, 2],"tol": 0.000001, "blockSize": 128, "solver": "l-bfgs"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
70 | 
71 | ## 预测
72 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "数据预处理","name" : 6001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 1.0, "regParam": 0.01, "regType": "l2", "convergenceTol": 0.001},"fileUrl":[{"data1":0}]},"next" : ["pre1"],"pre" : ["data1"]},"pre1" : {"type" : "机器学习","name" : 6000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label":"标签","features": [12, 13, 14, 15]},"fileUrl":[{"data1":0},{"exp1":0}]},"next" : [],"pre" : ["exp1","data1"]}}
73 | 
74 | # 拆分数据
75 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1009,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"proportion1": 0.7, "proportion2": 0.3, "seed": 10},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}}
76 | 
77 | # 评估
78 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "数据预处理","name" : 6001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 1.0, "regParam": 0.01, "regType": "l2", "convergenceTol": 0.001},"fileUrl":[{"data1":0}]},"next" : ["pre1"],"pre" : ["data1"]},"pre1" : {"type" : "机器学习","name" : 6000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label":"标签","features": [12, 13, 14, 15]},"fileUrl":[{"data1":0},{"exp1":0}]},"next" : ["eva1"],"pre" : ["exp1","data1"]},"eva1" : {"type" : "机器学习","name" : 7001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label":"标签"},"fileUrl":[]},"next" : [],"pre" : ["pre1"]}}
79 | 
80 | 
81 | 
82 | # 加载数据 + 加载gbdt二分类 + 预测
83 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["pre1"],"pre" : [""]},"model1" : {"type" : "exploration","name" : 8000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"userId": 1, "projectId": 37, "MLModelId": 3, "modelTypeId": 6002},"fileUrl":[]},"next" : ["pre1"],"pre" : []},"pre1" : {"type" : "exploration","name" : 6000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"]},"fileUrl":[{"data1":0},{"model1":0}]},"next" : [],"pre" : ["data1","model1"]}}
84 | 
85 | # 加载数据 + 加载逻辑回归二分类 + 预测
86 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["pre1"],"pre" : [""]},"model1" : {"type" : "exploration","name" : 8000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"userId": 1, "projectId": 32, "MLModelId": 5, "modelTypeId": 6003},"fileUrl":[]},"next" : ["pre1"],"pre" : []},"pre1" : {"type" : "exploration","name" : 6000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"]},"fileUrl":[{"data1":0},{"model1":0}]},"next" : [],"pre" : ["data1","model1"]}}
87 | 
88 | # 加载数据 + 加载多层感知机多分类 + 预测
89 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["pre1"],"pre" : [""]},"model1" : {"type" : "exploration","name" : 8000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"userId": 1, "projectId": 32, "MLModelId": 7, "modelTypeId": 6005},"fileUrl":[]},"next" : ["pre1"],"pre" : []},"pre1" : {"type" : "exploration","name" : 6000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"]},"fileUrl":[{"data1":0},{"model1":0}]},"next" : [],"pre" : ["data1","model1"]}}
90 | 
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/app/views/OperateFlow.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from flask import flash, get_flashed_messages, redirect, render_template, request, session, url_for, jsonify, Response, \
  3 |     abort
  4 | from flask.json import jsonify
  5 | from app import app
  6 | import json
  7 | import os
  8 | import time
  9 | from app.Utils import *
 10 | from app.views import Process
 11 | import app.Utils as apus
 12 | import pandas as pd
 13 | from pyspark.sql import SparkSession
 14 | import random
 15 | import string
 16 | 
 17 | 
 18 | # 解决 list, dict 不能返回的问题
 19 | class MyResponse(Response):
 20 |     @classmethod
 21 |     def force_type(cls, response, environ=None):
 22 |         if isinstance(response, (list, dict)):
 23 |             response = jsonify(response)
 24 |         return super(Response, cls).force_type(response, environ)
 25 | 
 26 | 
 27 | app.response_class = MyResponse
 28 | 
 29 | 
 30 | # 解析filter参数函数
 31 | def parsingFilterParameters(str):
 32 |     condition = []
 33 |     strList = str.split(';')
 34 |     for i in range(len(strList)):
 35 |         ll = strList[i].split(',', 3)
 36 |         con = {}
 37 |         con['name'] = ll[0]
 38 |         con['operate'] = ll[1]
 39 |         con['value'] = ll[2]
 40 |         con['relation'] = ll[3]
 41 |         condition.append(con)
 42 |     return condition
 43 | 
 44 | 
 45 | # 查看处理流程
 46 | @app.route("/getOperateFlow", methods=['POST'])
 47 | def getOperateFlow():
 48 |     projectName = request.form.get('projectName')
 49 |     userId = request.form.get('userId')
 50 |     project = getProjectByNameAndUserId(projectName, userId)
 51 |     # print(project)
 52 |     processflow = getProcessFlowByProjectId(project.id)
 53 |     operates = json.loads(processflow.operates)
 54 |     # print(operates)
 55 |     # for item in operates:
 56 |     #     # print(item)
 57 |     #     # print(item['type'])
 58 |     #     # print(item['operate'])
 59 |     #     if (item['type'] == '1'):
 60 |     #         item['operate'] = parsingFilterParameters(item['operate'])
 61 |     print(operates)
 62 |     return operates
 63 | 
 64 | 
 65 | @app.route("/executeAgain", methods=['POST'])
 66 | def executeAgain():
 67 |     """
 68 |     重新执行处理流程（DAG）。
 69 |     请求，判断这个节点的父节点是否执行完成，如果完成 拿父节点输出的数据 作为输入，处理后存储数据并标记该节点已经完成。
 70 |     :return:
 71 |     """
 72 |     projectName = request.form.get('projectName')
 73 |     userId = request.form.get('userId')
 74 |     nodeId = request.form.get('nodeId') # 节点开始执行的
 75 |     project = getProjectByNameAndUserId(projectName, userId)
 76 |     # print(project)
 77 |     processflow = getProcessFlowByProjectId(project.id)
 78 |     operates = json.loads(processflow.operates)
 79 |     fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl']
 80 |     # print(operates)
 81 |     functionName = projectName + "-executeAgain"
 82 | 
 83 |     # spark会话
 84 |     spark = getSparkSession(userId, functionName)
 85 | 
 86 |     # 获取数据
 87 |     df = spark.read.format("CSV").option("header", "true").load(fileUrl)
 88 | 
 89 |     # 执行DAG图
 90 |     for item in operates:
 91 |         if (item['type'] == '1'):
 92 |             # 解析参数格式
 93 |             condition = parsingFilterParameters(item['operate'])
 94 |             # 过滤函数
 95 |             df = Process.filterCore(spark, df, condition)
 96 |             df.show()
 97 | 
 98 |     # 处理后的数据写入文件
 99 |     df.toPandas().to_csv("/home/zk/data/test.csv", header=True)
100 |     # 返回前50条数据
101 |     data2 = df.limit(50).toJSON().collect()
102 |     print(data2)
103 |     data3 = ",".join(data2)
104 |     print(data3)
105 |     data4 = '[' + data3 + ']'
106 |     print(data4)
107 |     return jsonify({'length': df.count(), 'data': json.loads(data4)})
108 | 


--------------------------------------------------------------------------------
/app/views/OperateType.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | from flask import jsonify, Response
 3 | from app import app
 4 | from app.Utils import *
 5 | import app.dao.OperatorTypeDao as OperatorTypeDao
 6 | 
 7 | 
 8 | # 解决 list, dict 不能返回的问题
 9 | class MyResponse(Response):
10 |     @classmethod
11 |     def force_type(cls, response, environ=None):
12 |         if isinstance(response, (list, dict)):
13 |             response = jsonify(response)
14 |         return super(Response, cls).force_type(response, environ)
15 | 
16 | 
17 | app.response_class = MyResponse
18 | 
19 | 
20 | @app.route('/operateType/getAll', methods=['GET', 'POST'])
21 | def get_all_operate_type():
22 |     """
23 |     获取所有的算子种类
24 |     :return:
25 |     """
26 |     operator_types = OperatorTypeDao.get_all_operator_type()
27 | 
28 |     aaa = dict()
29 |     for i in operator_types:
30 |         if i.type_label not in aaa.keys():
31 |             aaa[i.type_label] = [{"id": i.id, "name": i.type_name}]
32 |         else:
33 |             aaa.get(i.type_label).append({"id": i.id, "name": i.type_name})
34 |             # 和Java一样 存的是数组的引用呀
35 | 
36 |     item_list = []
37 |     for name in aaa.keys():
38 |         list = aaa.get(name)
39 |         item_list.append({'name': name, 'list': list})
40 | 
41 |     result = {'list': item_list}
42 |     return result
43 | 


--------------------------------------------------------------------------------
/app/views/Operator.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from flask import jsonify, Response, request
  3 | from app import app
  4 | from app.Utils import *
  5 | import app.dao.OperatorDao as OperatorDao
  6 | import pandas as pd
  7 | import app.service.MLModelService as MLModelService
  8 | 
  9 | 
 10 | # 解决 list, dict 不能返回的问题
 11 | class MyResponse(Response):
 12 |     @classmethod
 13 |     def force_type(cls, response, environ=None):
 14 |         if isinstance(response, (list, dict)):
 15 |             response = jsonify(response)
 16 |         return super(Response, cls).force_type(response, environ)
 17 | 
 18 | 
 19 | app.response_class = MyResponse
 20 | 
 21 | 
 22 | @app.route('/operate/getOperateResultData', methods=['GET', 'POST'])
 23 | def get_operate_result_data():
 24 |     """
 25 |     查看算子运行结果数据
 26 |     :return:
 27 |     """
 28 |     operator_id = request.form.get('operatorId')
 29 |     start = int(request.form.get('start'))
 30 |     end = int(request.form.get('end'))
 31 |     print(operator_id, start, end)
 32 |     operator = OperatorDao.get_operator_by_id(operator_id)
 33 |     if operator.status != "success":
 34 |         return "请执行该节点"
 35 |     if operator.operator_output_url is not None:
 36 |         operator_output_url = operator.operator_output_url.split('*,')
 37 |     else:
 38 |         return "没有运行结果"
 39 |     result_arr = []
 40 |     try:
 41 |         for i in range(len(operator_output_url)):
 42 |             data = pd.read_csv(operator_output_url[i], encoding='utf-8')
 43 |             if len(data) < end:
 44 |                 end = len(data)
 45 |             if start > end:
 46 |                 result_arr.append({'length': len(data), 'data': "请输入合法参数", 'position': i})
 47 |             else:
 48 |                 data2 = data[int(start):int(end)].to_json(orient='records', force_ascii=False)
 49 |                 result_arr.append({'length': len(data), 'data': json.loads(data2), 'position': i})
 50 |         return jsonify(result_arr)
 51 |     except:
 52 |         traceback.print_exc()
 53 |         return "Error，please contact the administrator "
 54 | 
 55 | 
 56 | @app.route('/operate/saveOperateModel', methods=['GET', 'POST'])
 57 | def save_operate_model():
 58 |     """
 59 |     对于模型算子 保存模型
 60 |     :return:
 61 |     """
 62 |     operator_id = request.form.get('operatorId')
 63 |     user_id = request.form.get('userId')
 64 |     name = request.form.get('name')
 65 | 
 66 |     try:
 67 |         result = MLModelService.save_ml_model(operator_id, user_id, name)
 68 |         if isinstance(result, str):
 69 |             return result
 70 |         if isinstance(result, bool):
 71 |             if result is True:
 72 |                 return "success"
 73 |         return "fail"
 74 |     except:
 75 |         traceback.print_exc()
 76 |         return "Error，please contact the administrator "
 77 | 
 78 | 
 79 | @app.route('/operate/getOperateModel', methods=['GET', 'POST'])
 80 | def get_operate_model():
 81 |     """
 82 |     获取保存的模型
 83 |     :return:
 84 |     """
 85 |     ml_model_id = request.args.get('MLModelId')
 86 |     project_id = request.args.get('projectId')
 87 |     user_id = request.args.get('userId')
 88 |     model_id = request.args.get('modelId')
 89 |     name = request.args.get('name')
 90 |     status = request.args.get('status')
 91 | 
 92 |     print(ml_model_id, project_id, user_id, model_id, name, status)
 93 |     try:
 94 |         results = MLModelService.get_ml_model(ml_model_id, project_id, user_id, model_id, name, status)
 95 |         return jsonify(results)
 96 |     except:
 97 |         traceback.print_exc()
 98 |     return "Error，please contact the administrator "
 99 | 
100 | 
101 | @app.route('/operate/deleteOperateModel', methods=['GET', 'POST'])
102 | def delete_operate_model():
103 |     """
104 |     删除保存的模型
105 |     :return:
106 |     """
107 |     ml_model_id = request.form.get('MLModelId')
108 |     try:
109 |         model = MLModelService.delete_ml_model(ml_model_id)
110 |         return 'success'
111 |     except:
112 |         traceback.print_exc()
113 |     return "Error，please contact the administrator "
114 | 


--------------------------------------------------------------------------------
/app/views/Project.py:
--------------------------------------------------------------------------------
  1 | # encoding=utf8
  2 | import sys
  3 | from importlib import reload
  4 | 
  5 | reload(sys)
  6 | 
  7 | from flask import request, jsonify, Response
  8 | from flask.json import jsonify
  9 | from app import app
 10 | from app import db
 11 | from app.models.MSEntity import DataSource, Project, Model
 12 | import os
 13 | from app.Utils import mkdir, getProjectByNameAndUserId
 14 | from app.ConstFile import const
 15 | 
 16 | 
 17 | class MyResponse(Response):
 18 |     """解决 list, dict 不能返回的问题"""
 19 | 
 20 |     @classmethod
 21 |     def force_type(cls, response, environ=None):
 22 |         if isinstance(response, (list, dict)):
 23 |             response = jsonify(response)
 24 |         return super(Response, cls).force_type(response, environ)
 25 | 
 26 | 
 27 | app.response_class = MyResponse
 28 | 
 29 | 
 30 | @app.route('/project/testList', methods=['GET', 'POST'])
 31 | def test_list():
 32 |     """
 33 |     获取项目列表
 34 |     :return:
 35 |     """
 36 |     result = []
 37 | 
 38 |     return jsonify(result)
 39 | 
 40 | 
 41 | @app.route('/project/getAll', methods=['GET', 'POST'])
 42 | def get_all():
 43 |     """
 44 |     获取项目列表
 45 |     :return:
 46 |     """
 47 |     data_sources = Project.query.all()
 48 |     result = []
 49 |     for i in data_sources:
 50 |         result.append({"id": i.id, "name": i.project_name})
 51 |     return jsonify(result)
 52 | 
 53 | 
 54 | @app.route('/project/create', methods=['GET', 'POST'])
 55 | def create():
 56 |     """
 57 |     创建项目。
 58 |     创建项目的时候 创建一个model，现在项目和model是 1：1对应的关系
 59 | 
 60 |     :return:
 61 |     """
 62 |     if request.method == 'GET':
 63 |         projectName = request.form.get('projectName')
 64 |         userId = request.form.get('userId')
 65 |     else:
 66 |         projectName = request.form.get('projectName')
 67 |         userId = request.form.get('userId')
 68 |     # 弃用，因此默认 1
 69 |     data_source_id = 1
 70 |     print('projectName: {}, dataSourceId: {}, userId: {}'.format(projectName, data_source_id, userId))
 71 | 
 72 |     root_url = const.ROOTURL
 73 | 
 74 |     # 数据库中添加Project记录
 75 |     project = Project(project_name=projectName, project_address=root_url + projectName, user_id=userId,
 76 |                       dataSource_id=data_source_id)
 77 |     db.session.add(project)
 78 | 
 79 |     # 数据库中添加Model记录
 80 |     # 格式化成2016-03-20 11:45:39形式
 81 |     import time
 82 |     project = getProjectByNameAndUserId(projectName, userId)
 83 |     model = Model(model_name=projectName, project_id=project.id, start_nodes="",
 84 |                   create_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
 85 |     db.session.add(model)
 86 |     db.session.commit()
 87 | 
 88 |     # 创建项目目录
 89 |     try:
 90 |         if not (os.path.exists(root_url + projectName)):
 91 |             filters = {
 92 |                 DataSource.id == data_source_id
 93 |             }
 94 |             data_sources = DataSource.query.filter(*filters).first()
 95 |             db.session.commit()
 96 |             mkdir(root_url + projectName)
 97 |             print(data_sources.file_url)
 98 |             return get_all()
 99 |         else:
100 |             return "Double name"
101 |     except:
102 |         return "error"
103 | 


--------------------------------------------------------------------------------
/app/views/ProjectModel.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from flask import request, jsonify, Response
  3 | from app import app
  4 | from app.Utils import *
  5 | from app.dao.ModelDao import *
  6 | import app.service.ModelService as ModelService
  7 | 
  8 | 
  9 | # 解决 list, dict 不能返回的问题
 10 | class MyResponse(Response):
 11 |     @classmethod
 12 |     def force_type(cls, response, environ=None):
 13 |         if isinstance(response, (list, dict)):
 14 |             response = jsonify(response)
 15 |         return super(Response, cls).force_type(response, environ)
 16 | 
 17 | 
 18 | app.response_class = MyResponse
 19 | 
 20 | 
 21 | @app.route("/model/updateFlow", methods=['POST'])
 22 | def update_flow():
 23 |     """
 24 |     新建 处理流程
 25 |     :return:
 26 |     """
 27 |     user_id = request.form.get('userId')
 28 |     project_id = request.form.get('projectId')
 29 |     config = request.form.get('config')
 30 |     start_nodes = request.form.get('startNode')
 31 |     relationship = request.form.get('relationship')
 32 |     config_order = request.form.get('configOrder')
 33 | 
 34 |     print('------updateFlow', user_id, project_id, config, start_nodes, relationship, config_order)
 35 |     # 更新 model（流程图）
 36 |     result = ModelService.update_model(project_id, start_nodes, config, relationship, config_order)
 37 | 
 38 |     if result is not False:
 39 |         return "保存成功"
 40 |     else:
 41 |         return "保存失败，请重试！"
 42 | 
 43 | 
 44 | @app.route("/model/getFlow", methods=['POST'])
 45 | def get_flow():
 46 |     """
 47 |     查看model（执行流程）
 48 |     :return:
 49 |     """
 50 |     project_id = request.form.get('projectId')
 51 |     user_id = request.form.get('userId')
 52 | 
 53 |     print(project_id, user_id)
 54 |     flow = ModelService.get_model_by_project_id(project_id)
 55 |     if flow is False:
 56 |         return "获取执行流程图失败，请联系工作人员"
 57 | 
 58 |     return flow
 59 | 
 60 | 
 61 | @app.route("/model/getRunStatus", methods=['POST'])
 62 | def get_run_status():
 63 |     """
 64 |     查看model（执行流程）中每个节点的运行状态
 65 |     :return:
 66 |     """
 67 |     project_id = request.form.get('projectId')
 68 |     user_id = request.form.get('userId')
 69 |     model_execute_id = request.form.get('modelExecuteId')
 70 | 
 71 |     print(project_id, user_id, model_execute_id)
 72 |     # 查看状态
 73 |     flow = ModelService.get_run_status_by_project_id(project_id, model_execute_id)
 74 | 
 75 |     if flow is False:
 76 |         return "获取执行流程图失败，请联系工作人员"
 77 | 
 78 |     return flow
 79 | 
 80 | 
 81 | @app.route("/model/executeAll", methods=['POST'])
 82 | def model_execute_all():
 83 |     """
 84 |     从model（执行流程）中的某个节点开始执行
 85 |     :return:
 86 |     """
 87 |     import _thread
 88 | 
 89 |     project_id = request.form.get('projectId')
 90 |     user_id = request.form.get('userId')
 91 |     print('-----/model/executeAll-----', user_id, project_id)
 92 | 
 93 |     try:
 94 |         param = ModelService.run_execute_status_from_start(user_id, project_id)
 95 |         _thread.start_new_thread(ModelService.model_execute, (user_id, project_id, param))
 96 |         return {'model_execute_id': param['model_execute_id']}
 97 |     except:
 98 |         traceback.print_exc()
 99 |         print("Error: 无法启动线程")
100 |         return '启动失败'
101 | 
102 | 
103 | @app.route("/model/executeFromOne", methods=['POST'])
104 | def model_execute_from_one():
105 |     """
106 |     从model（执行流程）中的某个节点开始执行
107 |     :return:
108 |     """
109 |     import _thread
110 | 
111 |     project_id = request.form.get('projectId')
112 |     user_id = request.form.get('userId')
113 |     operator_id = request.form.get('operatorId')
114 |     print('-----/model/executeFromOne-----', user_id, project_id, operator_id)
115 | 
116 |     try:
117 |         param = ModelService.run_execute_status_from_one(user_id, operator_id)
118 |         _thread.start_new_thread(ModelService.model_execute, (user_id, project_id, param))
119 |         return {'model_execute_id': param['model_execute_id']}
120 |     except:
121 |         print("Error: 无法启动线程")
122 |         return '启动失败'
123 | 
124 |     return '启动成功'
125 | 


--------------------------------------------------------------------------------
/app/views/Report.py:
--------------------------------------------------------------------------------
  1 | # encoding=utf8
  2 | from flask import request
  3 | from flask.json import jsonify
  4 | from app import app
  5 | from app import db
  6 | from app.models.MSEntity import Report
  7 | import traceback
  8 | 
  9 | 
 10 | @app.route('/report/getAll', methods=['GET', 'POST'])
 11 | def report_get_all():
 12 |     """
 13 |     获取所有报告
 14 |     :return:
 15 |     """
 16 |     reports = Report.query.all()
 17 |     result = []
 18 |     for i in reports:
 19 |         result.append({"id": i.id, "userId": i.user_id, "title": i.report_title, "content": i.report_content})
 20 |     return jsonify(result)
 21 | 
 22 | 
 23 | @app.route('/report/getOne', methods=['GET'])
 24 | def report_get_one():
 25 |     """
 26 |     获取一个报告
 27 |     :return:
 28 |     """
 29 |     report_id = int(request.args.get('reportId'))
 30 |     report = db.session.query(Report).filter(Report.id == report_id).first()
 31 | 
 32 |     return {"id": report.id, "userId": report.user_id, "title": report.report_title, "content": report.report_content}
 33 | 
 34 | 
 35 | @app.route('/report/getReportByUserId', methods=['GET', 'POST'])
 36 | def report_get_by_user_id():
 37 |     """
 38 |     获取某个用户的所有报告
 39 |     :return:
 40 |     """
 41 |     user_id = request.args.get('userId')
 42 |     reports = db.session.query(Report).filter(Report.user_id == user_id)
 43 |     result = []
 44 |     for i in reports:
 45 |         result.append({"id": i.id, "userId": i.user_id, "title": i.report_title, "content": i.report_content})
 46 |     return jsonify(result)
 47 | 
 48 | 
 49 | @app.route('/report/deleteOne', methods=['POST'])
 50 | def report_delete_one():
 51 |     """
 52 |     删除一个报告
 53 |     :return:
 54 |     """
 55 |     try:
 56 |         report_id = int(request.form.get('reportId'))
 57 |         db.session.query(Report).filter(Report.id == report_id).delete()
 58 |         db.session.commit()
 59 |         return {"status": True, "message": "成功"}
 60 |     except:
 61 |         return {"status": False, "message": "失败"}
 62 | 
 63 | 
 64 | @app.route('/report/updateOne', methods=['POST'])
 65 | def report_update_one():
 66 |     """
 67 |     更新一个报告 title、content 传值为""是不更新
 68 |     :return:
 69 |     """
 70 |     try:
 71 |         report_id = request.form.get('reportId')
 72 |         report_title = request.form.get('title')
 73 |         report_content = request.form.get('content')
 74 | 
 75 |         update_content = {}
 76 |         if (report_title is not None) and (report_title != ''):
 77 |             update_content[Report.report_title] = report_title
 78 |         if (report_content is not None) and (report_content != ''):
 79 |             update_content[Report.report_content] = report_content
 80 | 
 81 |         db.session.query(Report).filter(Report.id == report_id).update(update_content)
 82 |         db.session.commit()
 83 |         return {"status": True, "message": "成功"}
 84 |     except:
 85 |         traceback.print_exc()
 86 |         return {"status": False, "message": "失败"}
 87 | 
 88 | 
 89 | @app.route('/report/save', methods=['POST'])
 90 | def report_save_one():
 91 |     """
 92 |     保存一个报告
 93 |     :return:
 94 |     """
 95 |     try:
 96 |         user_id = request.form.get('userId')
 97 |         report_title = request.form.get('title')
 98 |         report_content = request.form.get('content')
 99 | 
100 |         report = Report(user_id=user_id, report_title=report_title, report_content=report_content)
101 |         db.session.add(report)
102 |         db.session.commit()
103 |         return {"status": True, "message": "成功"}
104 |     except:
105 |         traceback.print_exc()
106 |         return {"status": False, "message": "失败"}
107 | 


--------------------------------------------------------------------------------
/app/views/Test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import app.views.Process as process
3 | 
4 | process.filterMultiConditions()


--------------------------------------------------------------------------------
/app/views/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/views/__init__.py


--------------------------------------------------------------------------------
/app/views/datasource/DataSource.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import os
  3 | from flask import request, send_from_directory, Response
  4 | from app.views.datasource.werkzeug.utils import secure_filename
  5 | from app import app, db
  6 | from app.models.MSEntity import DataSource
  7 | from sqlalchemy.sql import and_, or_, text
  8 | 
  9 | from flask.json import jsonify
 10 | import json
 11 | import pandas as pd
 12 | 
 13 | ALLOWED_EXTENSIONS = set(['txt', 'csv', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])
 14 | 
 15 | app.config['UPLOAD_FOLDER'] = os.getcwd() + '/../data'
 16 | app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 * 1024
 17 | 
 18 | html = '''
 19 |     <!DOCTYPE html>
 20 |     <title>Upload File</title>
 21 |     <h1>文件上传</h1>
 22 |     <form method=post enctype=multipart/form-data>
 23 |          <input type=file name=file>
 24 |          <input type=submit value=上传>
 25 |     </form>
 26 |     '''
 27 | 
 28 | 
 29 | # 解决 list, dict 不能返回的问题
 30 | class MyResponse(Response):
 31 |     @classmethod
 32 |     def force_type(cls, response, environ=None):
 33 |         if isinstance(response, (list, dict)):
 34 |             response = jsonify(response)
 35 |         return super(Response, cls).force_type(response, environ)
 36 | 
 37 | 
 38 | app.response_class = MyResponse
 39 | 
 40 | 
 41 | def allowed_file(filename):
 42 |     return '.' in filename and \
 43 |            filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS
 44 | 
 45 | 
 46 | @app.route('/uploads/<filename>')
 47 | def uploaded_file(filename):
 48 |     return send_from_directory(app.config['UPLOAD_FOLDER'], filename)
 49 | 
 50 | 
 51 | @app.route('/dataSource/upload', methods=['GET', 'POST'])
 52 | def upload_file():
 53 |     """
 54 |     上传文件
 55 |     :return:
 56 |     """
 57 |     if request.method == 'POST':
 58 |         file = request.files['file']
 59 |         # create_user = request.form.get('userId')
 60 |         # open_level = request.form.get('openLevel')
 61 |         create_user = 1
 62 |         open_level = 1
 63 |         if file and allowed_file(file.filename):
 64 |             # 保存文件
 65 |             from unicodedata import normalize
 66 |             filename = secure_filename(normalize('NFKD', file.filename).encode('utf-8', 'ignore').decode('utf-8'))
 67 |             print('文件保存路径：', os.path.join(app.config['UPLOAD_FOLDER'], filename))
 68 |             file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
 69 |             # file_url = url_for('uploaded_file', filename=filename)
 70 |             file_url = os.path.join(app.config['UPLOAD_FOLDER'], filename)
 71 | 
 72 |             # 保存文件记录
 73 |             import time
 74 |             # 格式化成2016-03-20 11:45:39形式
 75 |             create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 76 |             add_DataSource(file.filename[:-4], file_url, "CSV", create_user, open_level, create_time)
 77 |             return '文件上传成功'
 78 |     return html
 79 | 
 80 | 
 81 | @app.route('/dataSource/getFiles', methods=['GET', 'POST'])
 82 | def get_file():
 83 |     """
 84 |     查询数据源列表
 85 |     :return:
 86 |     """
 87 |     # 多条件并集查询
 88 |     condition = (DataSource.open_level == '0')  # 公开的数据集
 89 |     # 根据条件筛选酒店信息
 90 |     if request.args.get('userId'):  #
 91 |         userId = request.args.get('userId')
 92 |         condition = or_(condition, DataSource.create_user == userId)
 93 | 
 94 |     # 排序方式
 95 |     order_string = 'create_time' + ' desc'
 96 |     dataSource = DataSource.query.filter(condition).order_by(text(order_string))  # 执行查询
 97 | 
 98 |     result = []
 99 |     for i in dataSource:
100 |         result.append({"fileId": i.id, "fileName": i.file_name, 'fileUrl': i.file_url, 'fileType': i.file_type,
101 |                        'createUser': i.create_user, 'openLevel': i.open_level, 'createTime': i.create_time})
102 |     return jsonify(result)
103 | 
104 | 
105 | def add_DataSource(file_name, file_url, file_type, create_user, open_level, create_time):
106 |     """
107 |     新增数据源
108 |     :param file_name:
109 |     :param file_url:
110 |     :param file_type:
111 |     :param create_user:
112 |     :param open_level:
113 |     :param create_time:
114 |     :return:
115 |     """
116 |     source = DataSource(file_name=file_name, file_url=file_url, file_type=file_type, create_user=create_user,
117 |                         open_level=open_level, create_time=create_time)
118 |     db.session.add(source)
119 |     db.session.commit()
120 | 
121 | 
122 | @app.route('/dataSource/dataPreview', methods=['GET', 'POST'])
123 | def data_preview():
124 |     """
125 |     查看数据
126 |     :return:
127 |     """
128 |     if request.method == 'POST':
129 |         userId = request.form.get('userId')
130 |         fileId = request.form.get('fileId')
131 |         fileUrl = request.form.get('fileUrl')
132 |         start = request.form.get('start')
133 |         end = request.form.get('end')
134 | 
135 |     try:
136 |         data = pd.read_csv(fileUrl, encoding='utf-8')
137 |         data2 = data[int(start):int(end)].to_json(orient='records', force_ascii=False)
138 |         return jsonify({'length': len(data), 'data': json.loads(data2)})
139 |     except:
140 |         return "error read"
141 | 
142 | 
143 | @app.route("/dataSource/getColumnNames", methods=['GET', 'POST'])
144 | def get_column_names():
145 |     """
146 |     获取项目对应的当前数据源的所有列名
147 |     :return:
148 |     """
149 |     if request.method == 'GET':
150 |         userId = request.args.get('userId')
151 |         fileId = request.args.get('fileId')
152 |         fileUrl = request.args.get('fileUrl')
153 |     else:
154 |         return "需要get请求"
155 | 
156 |     try:
157 |         data = pd.read_csv(fileUrl, encoding='utf-8')
158 |         return jsonify(data.columns.values.tolist())
159 |     except:
160 |         return "error read"
161 | 
162 | 
163 | @app.route("/dataSource/getColumnNameWithNumberType", methods=['GET', 'POST'])
164 | def get_column_names_with_number_type():
165 |     """
166 |     获取项目对应的当前数据源的 数值型 的列名
167 |     :return:
168 |     """
169 |     if request.method == 'GET':
170 |         userId = request.args.get('userId')
171 |         fileId = request.args.get('fileId')
172 |         fileUrl = request.args.get('fileUrl')
173 |     else:
174 |         return "需要get请求"
175 |     try:
176 |         data = pd.read_csv(fileUrl, encoding='utf-8')
177 |         res = []
178 |         for col in data.columns.values.tolist():
179 |             if (data[col].dtype == 'int64' or data[col].dtype == 'float64'):
180 |                 res.append(col)
181 |         return jsonify(res)
182 |     except:
183 |         return "error read"
184 | 
185 | 
186 | def data_read(session, type, url):
187 |     """
188 |     读取数据
189 |     :param session: Spark Session
190 |     :param type: 数据源类型
191 |     :param url: 数据源地址
192 |     :return Spark DataFrame:
193 |     """
194 |     try:
195 |         # type = 'CSV'
196 |         df = session.read.csv(url, header=True, inferSchema=True)
197 |         return df
198 |     except:
199 |         return "error"
200 | 


--------------------------------------------------------------------------------
/app/views/datasource/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/views/datasource/__init__.py


--------------------------------------------------------------------------------
/app/views/datasource/werkzeug/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | flask(werkzeug.util)的secure_filename的中文上传问题(不支持中文文件名，自动过滤掉了中文)
3 | 重写---werkzeug.util
4 | 参考资料---http://www.voidcn.com/article/p-cxjdqemb-bpc.html
5 | '''
6 | 


--------------------------------------------------------------------------------
/app/views/v1/Mysql.py:
--------------------------------------------------------------------------------
  1 | # encoding=utf8
  2 | import sys
  3 | from importlib import reload
  4 | 
  5 | """
  6 | v1版本，弃用
  7 | """
  8 | reload(sys)
  9 | 
 10 | from flask import request, jsonify, Response
 11 | from flask.json import jsonify
 12 | from app import app
 13 | from app import db
 14 | from app.models.MSEntity import DataSource, Project, ProcessFlow, initdb
 15 | import shutil
 16 | import json
 17 | import pandas as pd
 18 | import os
 19 | from app.Utils import mkdir, getProjectCurrentDataUrl, getProjectByNameAndUserId
 20 | from app.ConstFile import const
 21 | from app.models.ServerNameMap import ServerNameMap
 22 | 
 23 | 
 24 | # 解决 list, dict 不能返回的问题
 25 | class MyResponse(Response):
 26 |     @classmethod
 27 |     def force_type(cls, response, environ=None):
 28 |         if isinstance(response, (list, dict)):
 29 |             response = jsonify(response)
 30 |         return super(Response, cls).force_type(response, environ)
 31 | 
 32 | 
 33 | app.response_class = MyResponse
 34 | 
 35 | 
 36 | # 初始化表，在mysql中新建表，对已存在的表无影响
 37 | # initdb()
 38 | 
 39 | # # 得到数据源列表
 40 | # @app.route('/getDataSource', methods=['GET', 'POST'])
 41 | # def getDataSource():
 42 | #     DSs = DataSource.query.all()
 43 | #     result = []
 44 | #     for i in DSs:
 45 | #         result.append({"id": i.id, "name": i.file_name})
 46 | #     return result
 47 | 
 48 | 
 49 | # 创建项目
 50 | @app.route('/creatProject', methods=['GET', 'POST'])
 51 | def creatProject():
 52 |     if request.method == 'GET':
 53 |         projectName = request.form.get('projectName')
 54 |         dataSourceId = request.form.get('dataSourceId')
 55 |         userId = request.form.get('userId')
 56 |     else:
 57 |         projectName = request.form.get('projectName')
 58 |         dataSourceId = request.form.get('dataSourceId')
 59 |         userId = request.form.get('userId')
 60 |     print('projectName: {}, dataSourceId: {}, userId: {}'.format(projectName, dataSourceId, userId))
 61 |     rootUrl = const.ROOTURL
 62 |     # 数据库中添加Project记录
 63 |     project = Project(project_name=projectName, project_address=rootUrl + projectName, user_id=userId,
 64 |                       dataSource_id=dataSourceId)
 65 |     db.session.add(project)
 66 |     # 数据库中添加ProcessFlow记录
 67 |     pro = getProjectByNameAndUserId(projectName, userId)
 68 |     processs = ProcessFlow(project_id=pro.id, operates="[]")
 69 |     db.session.add(processs)
 70 |     db.session.commit()
 71 |     try:
 72 |         if (not (os.path.exists(rootUrl + projectName))):
 73 |             filters = {
 74 |                 DataSource.id == dataSourceId
 75 |             }
 76 |             DSs = DataSource.query.filter(*filters).first()
 77 |             db.session.commit()
 78 |             mkdir(rootUrl + projectName)
 79 |             print(DSs.file_url)
 80 |             print(rootUrl + projectName)
 81 |             shutil.copyfile(DSs.file_url, rootUrl + projectName + '/' + DSs.file_name + '.csv')
 82 |             return getProjectList()
 83 |         else:
 84 |             return "Double name"
 85 |     except:
 86 |         return "error"
 87 | 
 88 | 
 89 | # 获取项目列表
 90 | @app.route('/getProjectList', methods=['GET', 'POST'])
 91 | def getProjectList():
 92 |     DSs = Project.query.all()
 93 |     result = []
 94 |     for i in DSs:
 95 |         result.append({"id": i.id, "name": i.project_name})
 96 |     return result
 97 | 
 98 | 
 99 | # 获取项目处理流程
100 | @app.route('/getProcessFlowByProjectId', methods=['GET', 'POST'])
101 | def getProcessFlowByProjectId():
102 |     # 接收参数
103 |     if request.method == 'GET':
104 |         projectId = request.args.get("projectId")
105 |     else:
106 |         projectId = request.form.get("projectId")
107 | 
108 |     # 定义返回状态
109 |     result = {}
110 |     result['success'] = True
111 |     result['errorCode'] = 200
112 |     result['errorMrg'] = ''
113 | 
114 |     # 查询projectId 对应的处理流程
115 |     try:
116 |         DSs = ProcessFlow.query.filter(ProcessFlow.project_id == projectId).one()
117 |     except:
118 |         result['success'] = False
119 |         result['errorCode'] = 550
120 |         result['errorMrg'] = '没有该项目ID：' + projectId
121 |         return result
122 |     print(DSs)
123 | 
124 |     # 整理返回格式
125 |     result['id'] = DSs.id
126 |     if (DSs.links is None) or (DSs.links == ''):
127 |         result['linkDataArray'] = ''
128 |     else:
129 |         result['linkDataArray'] = json.loads(DSs.links)
130 | 
131 |     if (DSs.operates is None) or (DSs.operates == ''):
132 |         operates = []
133 |     else:
134 |         operates = json.loads(DSs.operates)
135 |     for operate in operates:
136 |         operate['operateId'] = operate['type']
137 |         operate['operate'] = json.loads(operate['operate'])
138 |         operate['text'] = ServerNameMap.operateIdToNameMap[operate['operateId']]
139 |         operate['type'] = ServerNameMap.operateIdToTypeMap[operate['operateId']]
140 |         operate['color'] = ServerNameMap.typeToColorMap[operate['type']]
141 |     result['nodeDataArray'] = operates
142 |     result['class'] = 'go.GraphLinksModel'
143 |     return result
144 | 
145 | 
146 | # 原始数据预览
147 | @app.route('/rawDataPreview', methods=['GET', 'POST'])
148 | def rawDataPreview():
149 |     if request.method == 'GET':
150 |         start = request.form.get('start')
151 |         end = request.form.get('end')
152 |         projectName = request.form.get('projectName')
153 |     else:
154 |         start = request.form.get('start')
155 |         end = request.form.get('end')
156 |         projectName = request.form.get('projectName')
157 |     print('start: {}, end: {}, projectName: {}'.format(start, end, projectName))
158 |     try:
159 |         urls = getProjectCurrentDataUrl(projectName)
160 |         fileUrl = urls['fileUrl']
161 |     except:
162 |         return "error"
163 |     try:
164 |         data = pd.read_csv(fileUrl, encoding='utf-8')
165 |         data2 = data[int(start):int(end)].to_json(orient='records', force_ascii=False)
166 |         return jsonify({'length': len(data), 'data': json.loads(data2)})
167 |     except:
168 |         return "error read"
169 | 
170 | 
171 | # 当前数据预览
172 | @app.route('/currentDataPreview', methods=['GET', 'POST'])
173 | def currentDataPreview():
174 |     if request.method == 'GET':
175 |         start = request.form.get('start')
176 |         end = request.form.get('end')
177 |         projectName = request.form.get('projectName')
178 |     else:
179 |         start = request.form.get('start')
180 |         end = request.form.get('end')
181 |         projectName = request.form.get('projectName')
182 |     print('start: {}, end: {}, projectName: {}'.format(start, end, projectName))
183 |     try:
184 |         urls = getProjectCurrentDataUrl(projectName)
185 |         # print(urls)
186 |         fileUrl = urls['fileUrl']
187 |         # print(fileUrl)
188 |     except:
189 |         return "error"
190 |     try:
191 |         data = pd.read_csv(fileUrl, encoding='utf-8')
192 |         data2 = data[int(start):int(end)].to_json(orient='records', force_ascii=False)
193 |         return jsonify({'length': len(data), 'data': json.loads(data2)})
194 |     except:
195 |         return "error read"
196 | 


--------------------------------------------------------------------------------
/app/views/v1/Process.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from flask import request, jsonify, Response
  3 | from app import app
  4 | from app.Utils import *
  5 | from app.enmus.EnumConst import *
  6 | from pyspark.sql.functions import split, explode, concat_ws, regexp_replace
  7 | import random, string
  8 | from app.ConstFile import const
  9 | from datetime import datetime
 10 | 
 11 | """
 12 | v1版本，弃用
 13 | """
 14 | 
 15 | save_dir = const.SAVEDIR
 16 | 
 17 | 
 18 | # 欢迎页面
 19 | @app.route("/", methods=['GET', 'POST'])
 20 | def hello():
 21 |     return "<h1 style='color:blueviolet'> HomePage of Easy_Data</h1>"
 22 | 
 23 | 
 24 | # 解决 list, dict 不能返回的问题
 25 | class MyResponse(Response):
 26 |     @classmethod
 27 |     def force_type(cls, response, environ=None):
 28 |         if isinstance(response, (list, dict)):
 29 |             response = jsonify(response)
 30 |         return super(Response, cls).force_type(response, environ)
 31 | 
 32 | 
 33 | app.response_class = MyResponse
 34 | 
 35 | 
 36 | @app.route("/filter", methods=['GET', 'POST'])
 37 | def filterMultiConditions():
 38 |     """
 39 |     接受请求传参，例如: {"userId":"1","projectName":"特征工程测试项目","parameter":[{"colName":"利润", "operate":">", "value":"100", "relation":"AND"},{"colName":"装运方式", "operate":"==", "value":"一级", "relation":""}]}
 40 | 
 41 |     :return:
 42 |     """
 43 |     if request.method == 'GET':
 44 |         requestStr = request.args.get("requestStr")
 45 |     else:
 46 |         requestStr = request.form.get("requestStr")
 47 |     a = datetime.now()
 48 |     ## 参数测试例子
 49 |     # requestDict = {"userId": "1", "projectName": "特征工程测试项目",
 50 |     #                "parameter": [{"colName": "利润", "operate": ">", "value": "100", "relation": "AND"},
 51 |     #                              {"colName": "装运方式", "operate": "==", "value": "一级", "relation": ""}]}
 52 |     # requestStr = json.dumps(requestDict)
 53 |     requestDict = json.loads(requestStr)
 54 |     projectName = requestDict['projectName']  # 项目名称
 55 |     userId = requestDict['userId']  # 用户ID
 56 |     parameter = requestDict['parameter']
 57 |     functionName = "filter"
 58 |     state = True
 59 |     reason = ""
 60 |     print(functionName, projectName, userId, requestDict)
 61 |     # spark会话
 62 |     ss = getSparkSession(userId, functionName)
 63 |     # 解析项目路径，读取csv
 64 |     df = getProjectCurrentData(ss, projectName)
 65 |     if df == "error: 项目名或项目路径有误":
 66 |         state = False
 67 |         reason = df
 68 |         return returnDataModel(df, state, reason)
 69 |     # 过滤函数
 70 |     sqlDF = filterCore(ss, df, parameter)
 71 |     # 处理后的数据写入文件
 72 |     # sqlDF.write.csv(path='/home/zk/data/test.csv', header=True, sep=",", mode="overwrite")
 73 |     # sqlDF.coalesce(1).write.option("header", "true").csv("/home/zk/data/test.csv")
 74 |     sqlDF.toPandas().to_csv(save_dir, header=True)
 75 |     # 追加处理流程记录
 76 |     resultStr = addProcessingFlow(projectName, userId, OperatorType.FILTER.value, requestStr)
 77 |     if resultStr != "":
 78 |         state = False
 79 |         reason = reason + "  " + resultStr
 80 |     # 返回前50条数据
 81 |     b = datetime.now()
 82 |     print('-------------------------', b - a)
 83 |     return returnDataModel(df, state, reason)
 84 | 
 85 | 
 86 | def filterCoreParameter(projectName, parameterStr):
 87 |     try:
 88 |         urls = getProjectCurrentDataUrl(projectName)
 89 |         fileUrl = urls['fileUrl']
 90 |     except:
 91 |         return "error"
 92 |     parameter = {}
 93 |     parameter['fileUrl'] = fileUrl
 94 |     condition = []
 95 |     strList = parameterStr[0:len(parameterStr) - 1].split(';')
 96 |     for i in range(len(strList) - 1):
 97 |         if strList[i] == "" or strList[i] == None:
 98 |             continue
 99 |         ll = strList[i].split(',', 3)
100 |         con = {}
101 |         con['name'] = ll[0]
102 |         con['operate'] = ll[1]
103 |         con['value'] = ll[2]
104 |         con['relation'] = ll[3]
105 |         condition.append(con)
106 |     ll = strList[len(strList) - 1].split(',', 2)
107 |     con = {}
108 |     con['name'] = ll[0]
109 |     con['operate'] = ll[1]
110 |     con['value'] = ll[2]
111 |     con['relation'] = ""
112 |     condition.append(con)
113 |     parameter['condition'] = condition
114 |     return parameter
115 | 
116 | 
117 | # print(filterCoreParameter('甜点销售数据预处理', '列名一,关系,值,组合关系;列名一,关系,值,'))
118 | 
119 | def filterCore(spark, df, condition):
120 |     tableName = ''.join(random.sample(string.ascii_letters + string.digits, 8))
121 |     sqlStr = 'select * from ' + tableName + ' where'
122 |     # types = {}
123 |     # for i in df.dtypes:
124 |     #     types[i[0]] = i[1]
125 |     #     print(i)
126 |     for i in condition:
127 |         if is_number(i['value']):
128 |             sqlStr = sqlStr + ' `' + i['colName'] + '` ' + i['operate'] + ' ' + i['value'] + ' ' + i['relation']
129 |         else:
130 |             sqlStr = sqlStr + ' `' + i['colName'] + '` ' + i['operate'] + ' \"' + i['value'] + '\" ' + i['relation']
131 |     print(sqlStr)
132 |     df.createOrReplaceTempView(tableName)
133 |     sqlDF = spark.sql(sqlStr)
134 | 
135 |     return sqlDF
136 | 
137 | 
138 | # 排序 页面路由
139 | @app.route("/sort", methods=['GET', 'POST'])
140 | def sort():
141 |     # 接受请求传参，例如: {"userId":"1","projectName":"订单分析","columnName":"利润","sortType":"降序"}
142 |     ## 接收参数
143 |     if request.method == 'GET':
144 |         requestStr = request.args.get("requestStr")
145 |     else:
146 |         requestStr = request.form.get("requestStr")
147 |     # 对参数格式进行转化：json->字典，并进一步进行解析
148 |     requestDict = json.loads(requestStr)
149 | 
150 |     a = datetime.now()
151 | 
152 |     ## 参数示例
153 |     # requestDict = {"userId": "1", "projectName": "特征工程测试项目", "columnName": "利润", "sortType": "升序"}
154 |     # requestStr = json.dumps(requestDict)
155 | 
156 |     projectName = requestDict['projectName']  # 项目名称
157 |     userId = requestDict['userId']  # 用户ID
158 |     functionName = "sort"
159 |     state = True
160 |     reason = ""
161 |     print(functionName, projectName, userId, requestStr)
162 |     # spark会话
163 |     ss = getSparkSession(userId, functionName)
164 | 
165 |     # 解析项目路径，读取csv
166 |     df = getProjectCurrentData(ss, projectName)
167 |     if df == "error: 项目名或项目路径有误":
168 |         state = False
169 |         reason = df
170 |         return returnDataModel(df, state, reason)
171 | 
172 |     # 执行主函数，获取df(spark格式)
173 |     df = sortCore(requestDict, df)
174 |     if df == "ERROR_NOT_ONLY_ONE_COL":
175 |         state = False
176 |         reason = "error: 只能选择一列进行排序"
177 |         return returnDataModel(df, state, reason)
178 | 
179 |     # 处理后的数据写入文件（借助pandas进行存储、返回）
180 |     df.toPandas().to_csv(save_dir, header=True)
181 | 
182 |     # 追加处理流程记录
183 |     resultStr = addProcessingFlow(projectName, userId, OperatorType.SORT.value, requestStr)
184 |     if resultStr != "":
185 |         state = False
186 |         reason = resultStr
187 |     # 返回前50条数据
188 |     b = datetime.now()
189 |     print('-------------------------', b - a)
190 |     return returnDataModel(df, state, reason)
191 | 
192 | 
193 | # 排序主函数，函数功能包括解析参数、排序；返回df(spark格式)
194 | def sortCore(requestDict, df):
195 |     columnName = requestDict['columnName']  # 排序的列名（只能是一列）
196 |     # 只能输入一列，否则报错
197 |     if len(columnName.split(",")) != 1:
198 |         return "ERROR_NOT_ONLY_ONE_COL"
199 |     # sortType默认为升序，若用户指定，以用户指定为准
200 |     try:
201 |         sortType = requestDict['sortType']  # 排序类型（升序、降序）
202 |         if sortType == "":
203 |             sortType = "升序"
204 |     except:
205 |         sortType = "升序"
206 | 
207 |     # 排序
208 |     if sortType == "降序":
209 |         df = df.sort(columnName, ascending=False)
210 |     else:
211 |         df = df.sort(columnName)
212 |     return df
213 | 
214 | 
215 | # 排序 请求参数及返回值
216 | @app.route("/jsontostrTest", methods=['GET', 'POST'])
217 | def jsontostrTest():
218 |     req = {}
219 |     parameter = {}
220 |     parameter["userId"] = "1"
221 |     parameter["projectName"] = "特征工程测试项目"
222 |     parameter["columnName"] = "利润"
223 |     parameter["sortType"] = "升序"
224 | 
225 |     req["methods"] = "POST"
226 |     req["url"] = "http://10.108.211.130:8993/sort"
227 |     req["parameter"] = json.dumps(parameter, ensure_ascii=False)
228 |     req["result"] = {"data": [], "length": 35}
229 |     # print(json.dumps(parameter))
230 |     return json.dumps(parameter, ensure_ascii=False)
231 | 
232 | 
233 | # 按列拆分页面路由
234 | @app.route("/columnSplit", methods=['GET', 'POST'])
235 | def columnSplit():
236 |     # # 接受请求传参，例如: {"userId":"1","project":"订单分析","columnName":"订购日期","newColumnNames":["年","月","日"]}
237 |     if request.method == 'GET':
238 |         requestStr = request.args.get("requestStr")
239 |     else:
240 |         requestStr = request.form.get("requestStr")
241 | 
242 |     # 对参数格式进行转化：json->字典，并进一步进行解析
243 |     requestDict = json.loads(requestStr)
244 | 
245 |     a = datetime.now()
246 | 
247 |     ## 参数示例
248 |     # requestDict = {"userId": "1", "projectName": "特征工程测试项目", "columnName": "订购日期", "delimiter": "/",
249 |     #                "newColumnNames": ["year", "月"]}
250 |     # requestStr = json.dumps(requestDict)
251 | 
252 |     userId = requestDict['userId']  # 用户ID
253 |     projectName = requestDict['projectName']
254 |     functionName = "columnSplit"
255 |     state = True
256 |     reason = ""
257 |     print(functionName, projectName, userId, requestStr)
258 |     # spark会话
259 |     ss = getSparkSession(userId, functionName)
260 | 
261 |     # 解析项目路径，读取csv
262 |     df = getProjectCurrentData(ss, projectName)
263 |     if df == "error: 项目名或项目路径有误":
264 |         state = False
265 |         reason = df
266 |         return returnDataModel(df, state, reason)
267 | 
268 |     # 执行主函数，获取df(spark格式)
269 |     df = columnSplitCore(ss, df, requestDict)
270 |     if df == "ERROR_NOT_ONLY_ONE_COL":
271 |         state = False
272 |         reason = "error: 只能选择一列进行排序"
273 |         return returnDataModel(df, state, reason)
274 | 
275 |     # 处理后的数据写入文件（借助pandas进行存储、返回）
276 |     df_pandas = df.toPandas()
277 |     df_pandas.to_csv(save_dir, header=True)
278 | 
279 |     # 追加处理流程记录
280 |     resultStr = addProcessingFlow(projectName, userId, OperatorType.COLUMNSPLIT.value, requestStr)
281 |     if resultStr != "":
282 |         state = False
283 |         reason = resultStr
284 |     # 返回前50条数据
285 |     b = datetime.now()
286 |     print('-------------------------', b - a)
287 |     return returnDataModel(df, state, reason)
288 | 
289 | 
290 | # 按列拆分主函数，函数功能包括解析参数、拆分；返回df(spark格式)
291 | def columnSplitCore(ss, df, requestDict):
292 |     # 对参数格式进行转化：json->字典，并进一步进行解析
293 |     columnName = requestDict['columnName']
294 |     delimiter = requestDict['delimiter']
295 |     # 只能输入一列，否则报错
296 |     if len(columnName.split(",")) != 1:
297 |         return "ERROR_NOT_ONLY_ONE_COL"
298 |     # 获取拆分出的新列的列名，若未指定，暂时存储为空列表，后续根据拆分数填充成为[拆分列1,拆分列2,拆分列3,...]
299 |     try:
300 |         newColumnNames = requestDict['newColumnNames']
301 |     except:
302 |         newColumnNames = []
303 |     # 将指定列columnName按splitSymbol拆分，存入"splitColumn"列，列内数据格式为[a, b, c, ...]
304 |     print('---------------------------------------------------')
305 |     # df.show()
306 |     # df.head()
307 |     first_row = df.first()
308 |     df_split = df.withColumn("splitColumn", split(df[columnName], delimiter))
309 |     splitNumber = len(first_row[columnName].split(delimiter))
310 |     # 若用户为指定拆分出的新列的列名，根据拆分数填充
311 |     if newColumnNames == []:
312 |         for i in range(splitNumber):
313 |             newColumnNames.append("拆分列" + str(i + 1))
314 |     # 给新列名生成索引，格式为：[('年', 0), ('月', 1), ('日', 2)]，方便后续操作
315 |     sc = ss.sparkContext
316 |     newColumnNames_with_index = sc.parallelize(newColumnNames).zipWithIndex().collect()
317 |     # 遍历生成新列
318 |     for name, index in newColumnNames_with_index:
319 |         df_split = df_split.withColumn(name, df_split["splitColumn"].getItem(index))
320 |     df = df_split.drop("splitColumn")
321 |     return df
322 | 
323 | 
324 | # 识别符号
325 | def symbolRecognition(splitStr):
326 |     splitSymbol = ''
327 |     symbolList = ['/', '-', '_', '#', '@', '!', '$', '|', '=', ',', '.', '*']
328 |     for i in list(splitStr):
329 |         if i in symbolList:
330 |             splitSymbol = i
331 |             break
332 |     return splitSymbol
333 | 
334 | 
335 | # 按行拆分页面路由
336 | @app.route("/rowSplit", methods=['GET', 'POST'])
337 | def rowSplit():
338 |     # 接受请求传参，例如: {"project":"订单分析","columnName":"订单ID","newColumnName":"订单ID分割"}
339 |     if request.method == 'GET':
340 |         requestStr = request.args.get("requestStr")
341 |     else:
342 |         requestStr = request.form.get("requestStr")
343 | 
344 |     # 执行主函数，获取df(spark格式)
345 |     df = rowSplitCore(requestStr)
346 |     if df == "error_projectUrl":
347 |         return "error: 项目名或项目路径有误"
348 |     elif df == "error_columnInputNumSingle":
349 |         return "error: 只能选择一列进行拆分"
350 |     elif df == "error_splitSymbol":
351 |         return "error: 您指定的列中无可供拆分的符号"
352 | 
353 |     # 处理后的数据写入文件（借助pandas进行存储、返回）
354 |     df_pandas = df.toPandas()
355 |     df_pandas.to_csv(save_dir, header=True)
356 | 
357 |     return jsonify({'length': df.count(), 'data': df_pandas.to_json(force_ascii=False)})
358 | 
359 | 
360 | # 按行拆分主函数，函数功能包括解析参数、拆分；返回df(spark格式)
361 | # 自动识别拆分目标列中的符号，如：2019/03/25中的"/"
362 | def rowSplitCore(requestStr):
363 |     # 对参数格式进行转化：json->字典，并进一步进行解析
364 |     requestDict = json.loads(requestStr)
365 |     projectName = requestDict['projectName']
366 |     columnName = requestDict['columnName']
367 |     userId = '1'
368 |     functionName = "rowSplit"
369 |     print(functionName, projectName, userId, requestDict)
370 |     # 只能输入一列，否则报错
371 |     if len(columnName.split(",")) != 1:
372 |         return "error_columnInputNumSingle"
373 |     # newColumnName默认为columnName+“拆分”，若用户指定，以用户指定为准
374 |     try:
375 |         newColumnName = requestDict['newColumnName']
376 |     except:
377 |         newColumnName = columnName + "拆分"
378 | 
379 |     # spark会话
380 |     ss = getSparkSession(userId, functionName)
381 | 
382 |     # 解析项目路径，读取csv
383 |     df = getProjectCurrentData(ss, projectName)
384 |     if df == "error: 项目名或项目路径有误":
385 |         return df
386 | 
387 |     # 拆分
388 |     first_row = df.first()
389 |     splitStr = first_row[columnName]
390 |     # 识别splitStr中的符号
391 |     splitSymbol = symbolRecognition(splitStr)
392 |     if splitSymbol == '':
393 |         return "error_splitSymbol"  # 错误类型：指定列中不含可供拆分的符号
394 | 
395 |     # 将指定列columnName按splitSymbol拆分，存入newColumnName列的多行
396 |     df = df.withColumn(newColumnName, explode(split(df[columnName], splitSymbol)))
397 |     df.show()
398 | 
399 |     # 追加处理流程记录
400 |     addProcessingFlow(projectName, userId, OperatorType.ROWSPLIT.value, requestStr)
401 |     return df
402 | 
403 | 
404 | # 多列合并页面路由
405 | @app.route("/columnsMerge", methods=['GET', 'POST'])
406 | def columnsMerge():
407 |     # 接受请求传参，例如: {"projectName":"订单分析","columnNames":["类别","子类别","产品名称"],"newColumnName":"品类名称","splitSymbol":"-"}
408 |     if request.method == 'GET':
409 |         requestStr = request.args.get("requestStr")
410 |     else:
411 |         requestStr = request.form.get("requestStr")
412 |     # 对参数格式进行转化：json->字典，并进一步进行解析
413 |     requestDict = json.loads(requestStr)
414 | 
415 |     a = datetime.now()
416 | 
417 |     ## 参数示例
418 |     # requestDict = {"userId": "1", "projectName": "订单分析", "columnNames": ["类别", "子类别", "产品名称"], "connector": "-",
419 |     #                "newColumnName": "品类名称"}
420 |     # requestStr = json.dumps(requestDict)
421 | 
422 |     projectName = requestDict['projectName']
423 |     userId = '1'
424 |     functionName = "columnsMerge"
425 |     state = True
426 |     reason = ""
427 |     print(functionName, projectName, userId, requestDict)
428 |     # spark会话
429 |     ss = getSparkSession(userId, functionName)
430 | 
431 |     # 解析项目路径，读取csv
432 |     df = getProjectCurrentData(ss, projectName)
433 |     if df == "error: 项目名或项目路径有误":
434 |         state = False
435 |         reason = df
436 |         return returnDataModel(df, state, reason)
437 | 
438 |     # 执行主函数，获取df(spark格式)
439 |     df = columnsMergeCore(df, requestDict)
440 | 
441 |     # 处理后的数据写入文件（借助pandas进行存储、返回）
442 |     df_pandas = df.toPandas()
443 |     df_pandas.to_csv(save_dir, header=True)
444 | 
445 |     # 追加处理流程记录
446 |     resultStr = addProcessingFlow(projectName, userId, OperatorType.COLUMNMERGE.value, requestStr)
447 |     if resultStr != "":
448 |         state = False
449 |         reason = resultStr
450 |     # 返回前50条数据
451 |     b = datetime.now()
452 |     print('-------------------------', b - a)
453 |     return returnDataModel(df, state, reason)
454 | 
455 | 
456 | # 多列合并主函数，新增一列，列内的值为指定多列合并而成；返回df(spark格式)
457 | def columnsMergeCore(df, requestDict):
458 |     columnNames = requestDict['columnNames']
459 |     # 默认分隔符是","，若requestStr中指定了分隔符，则以用户指定为准
460 |     try:
461 |         splitSymbol = requestDict['connector']
462 |     except:
463 |         splitSymbol = ','
464 |     # 默认新列名称为：合并结果(col1, col2, col3, ...)，若用户指定，以用户指定为准
465 |     try:
466 |         newColumnName = requestDict['newColumnName']
467 |     except:
468 |         newColumnName = "合并结果" + "(" + str(columnNames).strip("[]") + ")"
469 | 
470 |     # 合并(spark的dataframe操作好蠢，暂时先用笨办法合并吧 >_< )
471 |     if len(columnNames) == 2:
472 |         df = df.withColumn(newColumnName, concat_ws(splitSymbol, df[columnNames[0]], df[columnNames[1]]))
473 |     elif len(columnNames) == 3:
474 |         df = df.withColumn(newColumnName,
475 |                            concat_ws(splitSymbol, df[columnNames[0]], df[columnNames[1]], df[columnNames[2]]))
476 |     elif len(columnNames) == 4:
477 |         df = df.withColumn(newColumnName,
478 |                            concat_ws(splitSymbol, df[columnNames[0]], df[columnNames[1]], df[columnNames[2]],
479 |                                      df[columnNames[3]]))
480 |     return df
481 | 
482 | 
483 | # 数据列替换 页面路由
484 | @app.route("/replace", methods=['GET', 'POST'])
485 | def replace():
486 |     # # 接受请求传参，例如: {"project":"订单分析","columnName":"客户ID","sourceCharacter":"0","targetCharacter":"Q"}
487 |     if request.method == 'GET':
488 |         requestStr = request.args.get("requestStr")
489 |     else:
490 |         requestStr = request.form.get("requestStr")
491 | 
492 |     # 对参数格式进行转化：json->字典，并进一步进行解析
493 |     requestDict = json.loads(requestStr)
494 | 
495 |     a = datetime.now()
496 |     ## 参数示例
497 |     # requestDict = {"userId": "1", "projectName": "订单分析", "columnNames": ["类别", "子类别", "客户名称"],
498 |     #                "sourceCharacters": ["技术", "电话", "CraigReiter", "复印机"],
499 |     #                "targetCharacters": ["技术copy", "电话copy", "CraigReitercopy", "复印机copy"]}
500 |     # requestStr = json.dumps(requestDict)
501 | 
502 |     projectName = requestDict['projectName']
503 |     userId = '1'
504 |     functionName = "replace"
505 |     state = True
506 |     reason = ""
507 |     print(functionName, projectName, userId, requestDict)
508 |     # spark会话
509 |     ss = getSparkSession(userId, functionName)
510 | 
511 |     # 解析项目路径，读取csv
512 |     df = getProjectCurrentData(ss, projectName)
513 |     if df == "error: 项目名或项目路径有误":
514 |         state = False
515 |         reason = df
516 |         return returnDataModel(df, state, reason)
517 | 
518 |     # 执行主函数，获取df(spark格式)
519 |     df = replaceCore(df, requestDict)
520 | 
521 |     # 处理后的数据写入文件（借助pandas进行存储、返回）
522 |     df_pandas = df.toPandas()
523 |     df_pandas.to_csv(save_dir, header=True)
524 | 
525 |     # 追加处理流程记录
526 |     resultStr = addProcessingFlow(projectName, userId, OperatorType.REPLACE.value, requestStr)
527 |     if resultStr != "":
528 |         state = False
529 |         reason = resultStr
530 |     # 返回前50条数据
531 |     b = datetime.now()
532 |     print('-------------------------', b - a)
533 |     return returnDataModel(df, state, reason)
534 | 
535 | 
536 | # 数据列替换主函数, 将某列中的字符进行替换；返回df(spark格式)
537 | def replaceCore(df, requestDict):
538 |     columnNames = requestDict['columnNames']
539 |     sourceCharacters = requestDict['sourceCharacters']
540 |     targetCharacters = requestDict['targetCharacters']
541 | 
542 |     if len(sourceCharacters) != len(targetCharacters):
543 |         return "被替换的字符和用于替换的字符个数不相等"
544 |     for i in range(len(columnNames)):
545 |         # 字符替换
546 |         columnName = columnNames[i]
547 |         df = df.withColumn("temp", (mul_regexp_replace(df[columnName], sourceCharacters, targetCharacters)))
548 |         df = df.drop(columnName)
549 |         df = df.withColumnRenamed("temp", columnName)
550 |     return df
551 | 
552 | 
553 | def mul_regexp_replace(col, sourceCharacters, targetCharacters):
554 |     for i in range(len(sourceCharacters)):
555 |         col = regexp_replace(col, sourceCharacters[i], targetCharacters[i])
556 |     return col
557 | 
558 | 
559 | @app.route("/testTime", methods=['GET', 'POST'])
560 | def testTime():
561 |     a = datetime.now()
562 |     filterMultiConditions()
563 |     sort()
564 |     columnSplit()
565 |     columnsMerge()
566 |     replace()
567 |     b = datetime.now()
568 |     print('-------------------------', b - a)
569 | 


--------------------------------------------------------------------------------
/app/views/v1/Process2.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from flask import request, jsonify, Response
  3 | from app import app
  4 | from app.Utils import *
  5 | from app.enmus.EnumConst import OperatorType
  6 | from app.ConstFile import const
  7 | 
  8 | """
  9 | v1版本，弃用
 10 | """
 11 | 
 12 | save_dir = const.SAVEDIR
 13 | 
 14 | 
 15 | # 解决 list, dict 不能返回的问题
 16 | class MyResponse(Response):
 17 |     @classmethod
 18 |     def force_type(cls, response, environ=None):
 19 |         if isinstance(response, (list, dict)):
 20 |             response = jsonify(response)
 21 |         return super(Response, cls).force_type(response, environ)
 22 | 
 23 | 
 24 | app.response_class = MyResponse
 25 | 
 26 | 
 27 | # 填充空值
 28 | @app.route("/fillNullValue", methods=['GET', 'POST'])
 29 | def fillNullValue():
 30 |     # 接受请求传参，例如: {"project":"订单分析","columnName":"客户ID","sourceCharacter":"0","targetCharacter":"Q"}
 31 |     if request.method == 'GET':
 32 |         requestStr = request.args.get("requestStr")
 33 |     else:
 34 |         requestStr = request.form.get("requestStr")
 35 | 
 36 |     # 对参数格式进行转化：json->字典，并进一步进行解析
 37 |     requestDict = json.loads(requestStr)
 38 |     projectName = requestDict['projectName']
 39 |     userId = requestDict['userId']
 40 |     parameter = requestDict['parameter']
 41 |     functionName = "fillNullValue"
 42 |     state = True
 43 |     reason = ""
 44 |     print(functionName, projectName, userId, requestDict)
 45 |     ss = getSparkSession(userId, functionName)
 46 |     # 解析项目路径，读取csv
 47 |     df = getProjectCurrentData(ss, projectName)
 48 |     if df == "error: 项目名或项目路径有误":
 49 |         state = False
 50 |         reason = df
 51 |         return returnDataModel(df, state, reason)
 52 |     # 过滤函数
 53 |     sqlDF = fillNullValueCore(ss, df, parameter)
 54 |     # 处理后的数据写入文件
 55 |     sqlDF.toPandas().to_csv(save_dir, header=True)
 56 |     # 追加处理流程记录
 57 |     resultStr = addProcessingFlow(projectName, userId, OperatorType.FILLNULLVALUE.value, requestStr)
 58 |     if resultStr != "":
 59 |         state = False
 60 |         reason = resultStr
 61 |     # 返回前50条数据
 62 |     return returnDataModel(df, state, reason)
 63 | 
 64 | 
 65 | def fillNullValueParameter(projectName, parameterStr):
 66 |     try:
 67 |         urls = getProjectCurrentDataUrl(projectName)
 68 |         fileUrl = urls['fileUrl']
 69 |     except:
 70 |         return "error"
 71 |     parameter = {}
 72 |     parameter['fileUrl'] = fileUrl
 73 |     condition = []
 74 |     strList = parameterStr.split(';')
 75 |     for i in range(len(strList)):
 76 |         if strList[i] == "" or strList[i] == None:
 77 |             continue
 78 |         ll = strList[i].split(',', 1)
 79 |         con = {}
 80 |         con['name'] = ll[0]
 81 |         con['operate'] = ll[1]
 82 |         condition.append(con)
 83 |     parameter['condition'] = condition
 84 |     return parameter
 85 | 
 86 | 
 87 | # print(fillNullValueParameter('特征工程测试项目', '列名一,填充方法;列名一,填充方法'))
 88 | 
 89 | def fillNullValueCore(spark, df, condition):
 90 |     # val fillColValues = Map("StockCode" -> 5, "Description" -> "No value")
 91 |     # df.na.fill(fillColValues)
 92 |     for i in condition:
 93 |         if (i['operate'] == '均值填充'):
 94 |             mean_item = df.select(func.mean(i['colName'])).collect()[0][0]
 95 |             df = df.na.fill({i['colName']: mean_item})
 96 |         elif (i['operate'] == '最大值填充'):
 97 |             mean_item = df.select(func.max(i['colName'])).collect()[0][0]
 98 |             df = df.na.fill({i['colName']: mean_item})
 99 |         elif (i['operate'] == '最小值填充'):
100 |             mean_item = df.select(func.min(i['colName'])).collect()[0][0]
101 |             df = df.na.fill({i['colName']: mean_item})
102 |     return df
103 | 
104 | 
105 | @app.route("/columnMap", methods=['GET', 'POST'])
106 | def columnMap():
107 |     if request.method == 'GET':
108 |         requestStr = request.args.get("requestStr")
109 |     else:
110 |         requestStr = request.form.get("requestStr")
111 |     # 对参数格式进行转化：json->字典，并进一步进行解析
112 |     requestDict = json.loads(requestStr)
113 |     userId = requestDict['userId']  # 用户ID
114 |     projectName = requestDict['projectName']
115 |     parameter = requestDict['parameter']
116 |     functionName = "columnMap"
117 |     state = True
118 |     reason = ""
119 |     print(functionName, projectName, userId, requestDict)
120 |     # 获取sparkSession
121 |     ss = getSparkSession(userId, functionName)
122 |     # 解析项目路径，读取csv
123 |     df = getProjectCurrentData(ss, projectName)
124 |     if df == "error: 项目名或项目路径有误":
125 |         state = False
126 |         reason = df
127 |         return returnDataModel(df, state, reason)
128 |     # 过滤函数
129 |     sqlDF = columnMapCore(ss, df, parameter)
130 |     # 处理后的数据写入文件
131 |     sqlDF.toPandas().to_csv(save_dir, header=True)
132 |     # 追加处理流程记录
133 |     resultStr = addProcessingFlow(projectName, userId, OperatorType.COLUMNMAP.value, requestStr)
134 |     if resultStr != "":
135 |         state = False
136 |         reason = resultStr
137 |     # 返回前50条数据
138 |     return returnDataModel(df, state, reason)
139 | 
140 | 
141 | def columnMapParameter(projectName, parameterStr):
142 |     try:
143 |         urls = getProjectCurrentDataUrl(projectName)
144 |         fileUrl = urls['fileUrl']
145 |     except:
146 |         return "error"
147 |     parameter = {}
148 |     parameter['fileUrl'] = fileUrl
149 |     condition = []
150 |     strList = parameterStr.split(';')
151 |     for i in range(len(strList)):
152 |         if strList[i] == "" or strList[i] == None:
153 |             continue
154 |         ll = strList[i].split(',', 7)
155 |         con = {}
156 |         con['colName_1'] = ll[0]
157 |         con['operate_1'] = ll[1]
158 |         con['value_1'] = ll[2]
159 | 
160 |         con['operate'] = ll[3]
161 |         con['colName_2'] = ll[4]
162 |         con['operate_2'] = ll[5]
163 |         con['value_2'] = ll[6]
164 | 
165 |         con['newName'] = ll[7]
166 |         condition.append(con)
167 |     parameter['condition'] = condition
168 |     return parameter
169 | 
170 | 
171 | from pyspark.sql import functions as func
172 | 
173 | 
174 | def columnMapCore(spark, df, condition):
175 |     # val fillColValues = Map("StockCode" -> 5, "Description" -> "No value")
176 |     # df.na.fill(fillColValues)
177 |     for i in condition:
178 |         name1 = i['colName_1']
179 |         name2 = i['colName_2']
180 |         newName = i['newName']
181 |         if (i['operate_1'] == '+'):
182 |             df = df.withColumn(newName, df[name1] + i['value_1'])
183 |         elif (i['operate_1'] == '-'):
184 |             df = df.withColumn(newName, df[name1] - i['value_1'])
185 |         elif (i['operate_1'] == '*'):
186 |             df = df.withColumn(newName, df[name1] * i['value_1'])
187 |         elif (i['operate_1'] == '/'):
188 |             df = df.withColumn(newName, df[name1] / i['value1_'])
189 |         if (not ((name2 == "") or (name2 == None))):
190 |             newName2 = newName + "_2"
191 |             if (i['operate_2'] == '+'):
192 |                 df = df.withColumn(newName2, df[name2] + i['value_2'])
193 |             elif (i['operate_2'] == '-'):
194 |                 df = df.withColumn(newName2, df[name2] - i['value_2'])
195 |             elif (i['operate_2'] == '*'):
196 |                 df = df.withColumn(newName2, df[name2] * i['value_2'])
197 |             elif (i['operate_2'] == '/'):
198 |                 df = df.withColumn(newName2, df[name2] / i['value_2'])
199 | 
200 |             if (i['operate'] == '+'):
201 |                 df = df.withColumn(newName, df[newName] + df[newName2])
202 |             elif (i['operate'] == '-'):
203 |                 df = df.withColumn(newName, df[newName] - df[newName2])
204 |             elif (i['operate'] == '*'):
205 |                 df = df.withColumn(newName, df[newName] * df[newName2])
206 |             elif (i['operate'] == '/'):
207 |                 df = df.withColumn(newName, df[newName] / df[newName2])
208 |             df = df.drop(newName2)
209 |     return df
210 | 


--------------------------------------------------------------------------------
/app/views/v1/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/views/v1/__init__.py


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import os
 3 | 
 4 | # 24位字符设置
 5 | SECRET_KEY = os.urandom(24)
 6 | 
 7 | # 这里登陆的是root用户，要填上自己的密码，MySQL的默认端口是3306，填上之前创建的数据库名test
 8 | SQLALCHEMY_DATABASE_URI = 'mysql+pymysql://root:mySQL#h@d00p@localhost:3306/visualization'
 9 | # SQLALCHEMY_DATABASE_URI = 'mysql+pymysql://root:0000@localhost:3306/visualization'
10 | # SQLALCHEMY_DATABASE_URI = 'mysql+pymysql://root:cainiaoji@localhost:3306/visualization'
11 | # 设置这一项是每次请求结束后都会自动提交数据库中的变动
12 | SQLALCHEMY_TRACK_MODIFICATIONS = False
13 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | flask
3 | flask_sqlalchemy
4 | pandas/pyspark


--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | #!flask/bin/python
2 | from app import app
3 | app.run(debug=True, host='10.108.211.130', port=8993)
4 | # app.run(debug=True, host='0.0.0.0', port=8993)
5 | # app.run(debug=True, host='127.0.0.1', port=8993)


--------------------------------------------------------------------------------