├── .gitignore ├── README.md ├── app ├── ConstFile.py ├── Utils.py ├── __init__.py ├── dao │ ├── MLModelDao.py │ ├── ModelDao.py │ ├── ModelExecuteDao.py │ ├── OperatorDao.py │ ├── OperatorTypeDao.py │ ├── ProjectDao.py │ └── __init__.py ├── enmus │ ├── EnumConst.py │ └── __init__.py ├── ml │ ├── __init__.py │ ├── multipleClassification │ │ ├── LR.py │ │ ├── MPC.py │ │ ├── RF.py │ │ └── __init__.py │ └── secondClassification │ │ ├── GBDT.py │ │ ├── LR.py │ │ ├── SVM.py │ │ └── __init__.py ├── models │ ├── MSEntity.py │ ├── ServerNameMap.py │ └── __init__.py ├── service │ ├── ClearTask.py │ ├── ExplorationService.py │ ├── FEService.py │ ├── MLModelService.py │ ├── ModelExecuteService.py │ ├── ModelService.py │ ├── PreprocessService.py │ ├── __init__.py │ └── ml │ │ ├── Evaluation.py │ │ ├── ModelService.py │ │ ├── MultipleClassifition.py │ │ ├── PredictService.py │ │ ├── SecondClassification.py │ │ └── __init__.py ├── test │ ├── FPGrowthTest.py │ ├── PySparkTest.py │ ├── RandomForestTest.py │ ├── Test.py │ └── zhoukang └── views │ ├── OperateFlow.py │ ├── OperateType.py │ ├── Operator.py │ ├── Project.py │ ├── ProjectModel.py │ ├── Report.py │ ├── Test.py │ ├── __init__.py │ ├── datasource │ ├── DataSource.py │ ├── __init__.py │ └── werkzeug │ │ ├── __init__.py │ │ └── utils.py │ └── v1 │ ├── Exploration.py │ ├── FeatureEngineering.py │ ├── Mysql.py │ ├── Process.py │ ├── Process2.py │ └── __init__.py ├── config.py ├── requirements.txt └── run.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | *.pyc 4 | .DS_Store 5 | app/.DS_Store 6 | .xml 7 | config.py 8 | run.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *机器学习可视化项目* 2 | 数据探索-数据预处理-特征工程-模型训练和预测 -------------------------------------------------------------------------------- /app/ConstFile.py: -------------------------------------------------------------------------------- 1 | class Const(object): 2 | class ConstError(TypeError): 3 | pass 4 | 5 | class ConstCaseError(ConstError): 6 | pass 7 | 8 | def __setattr__(self, name, value): 9 | if name in self.__dict__: # 判断是否已经被赋值,如果是则报错 10 | raise self.ConstError("Can't change const.%s" % name) 11 | if not name.isupper(): # 判断所赋值是否是全部大写,用来做第一次赋值的格式判断,也可以根据需要改成其他判断条件 12 | raise self.ConstCaseError('const name "%s" is not all supercase' % name) 13 | 14 | self.__dict__[name] = value 15 | 16 | 17 | const = Const() 18 | 19 | const.ROOTURL = "/home/zk/project/" 20 | # const.ROOTURL = "/Users/kang/PycharmProjects/project" 21 | 22 | # csv文件存储目录(临时) 23 | const.SAVEDIR = "/home/zk/project/test.csv" 24 | # const.SAVEDIR = '/Users/kang/PycharmProjects/project/test.csv' 25 | # const.SAVEDIR = "/Users/tc/Desktop/可视化4.0/Project/test.csv" 26 | 27 | # exploration 临时视图存放 28 | const.JSONFILENAME = 'qazwsxedcrfvtgbyhnujmiopkl' + '.json' 29 | 30 | # 算子运行产生的中间数据 31 | const.MIDDATA = '/home/zk/midData/' 32 | -------------------------------------------------------------------------------- /app/Utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from app.models.MSEntity import Project, ProcessFlow 3 | import os, json, time 4 | from app import db 5 | import pandas as pd 6 | from pyspark.sql import SparkSession 7 | import uuid, shutil, traceback 8 | from flask.json import jsonify 9 | from app.ConstFile import const 10 | 11 | 12 | def list_str_to_list(str): 13 | """ 14 | 字符串数组 15 | :param str: "["1","2"]" 16 | :return: 17 | """ 18 | if str is None or str == '': 19 | return {} 20 | change = json.loads("{\"key\":" + str.replace("'", "\"") + "}") 21 | return change['key'] 22 | 23 | 24 | # 返回数据 25 | def returnDataModel(df, state, reason): 26 | if state: 27 | return jsonify({'state': state, 'reason': reason, 'length': df.count(), 'data': dfToJson(df, 50)}) 28 | else: 29 | return jsonify({'state': state, 'reason': reason, 'length': 0, 'data': {}}) 30 | 31 | 32 | # 获取时间戳 33 | def funTime(): 34 | t = time.time() 35 | return str(int(round(t * 1000))) # 毫秒级时间戳 36 | 37 | 38 | # 获取一个新的SparkSession 39 | def getSparkSession(userId, computationName): 40 | appName = str(userId) + "_" + computationName + '_' + str(funTime()) 41 | print('Spark Session Name: ', appName) 42 | # ss = SparkSession \ 43 | # .builder \ 44 | # .appName(appName) \ 45 | # .master("spark://10.108.211.130:7077") \ 46 | # .getOrCreate() 47 | 48 | ss = SparkSession \ 49 | .builder \ 50 | .appName(appName) \ 51 | .master("local[*]") \ 52 | .getOrCreate() 53 | return ss 54 | 55 | 56 | # 返回前nums条数据(json格式) 57 | def dfToJson(df, nums): 58 | data_1 = df.limit(nums).toJSON().collect() 59 | data_2 = ",".join(data_1) 60 | data_3 = '[' + data_2 + ']' 61 | return json.loads(data_3) 62 | 63 | 64 | # 获取处理流 65 | def getProcessFlowByProjectId(projectId): 66 | try: 67 | filters = { 68 | ProcessFlow.project_id == projectId, 69 | } 70 | return ProcessFlow.query.filter(*filters).first() 71 | except: 72 | return "error" 73 | 74 | 75 | # 追加处理流程记录 76 | def addProcessingFlow(projectName, userId, operateType, operateParameter): 77 | try: 78 | operate = {} 79 | operate['type'] = operateType 80 | operate['key'] = str(uuid.uuid1()) 81 | print(operate['key']) 82 | operate['operate'] = operateParameter 83 | print("追加处理流程", projectName, userId, operate) 84 | pflow = db.session.query(ProcessFlow.id, ProcessFlow.project_id, ProcessFlow.operates, ProcessFlow.cur_ope_id, 85 | ProcessFlow.links). \ 86 | join(Project, Project.id == ProcessFlow.project_id). \ 87 | filter(Project.project_name == projectName). \ 88 | filter(Project.user_id == userId). \ 89 | first() 90 | # 修改 operates 91 | # print(len(pflow)) 92 | if not (pflow[2] == None or pflow[2] == ""): 93 | operates = json.loads(pflow[2]) 94 | else: 95 | operates = [] 96 | operates.append(operate) 97 | operateStr = json.dumps(operates, ensure_ascii=False) 98 | # 修改 links 99 | if not (pflow[3] == None or pflow[3] == ""): 100 | link = {} 101 | link['from'] = pflow[3] 102 | link['to'] = operate['key'] 103 | if not (pflow[4] == None or pflow[4] == ""): 104 | links = json.loads(pflow[4]) 105 | else: 106 | links = [] 107 | links.append(link) 108 | linkStr = json.dumps(links, ensure_ascii=False) 109 | else: 110 | linkStr = pflow[4] 111 | filters = { 112 | ProcessFlow.id == pflow[0], 113 | } 114 | result = ProcessFlow.query.filter(*filters).first() 115 | result.operates = operateStr 116 | result.links = linkStr 117 | result.cur_ope_id = operate['key'] 118 | db.session.commit() 119 | return "" 120 | except Exception: 121 | print('traceback.format_exc():\n%s' % traceback.format_exc()) 122 | print("追加数据流程出错") 123 | return "追加数据流程出错" 124 | 125 | 126 | # addProcessingFlow('甜点销售数据预处理',1,{'type':'1','operate':'列名一,关系,值,组合关系;列名一,关系,值,'}) 127 | 128 | # 获取项目 129 | def getProjectByNameAndUserId(projectName, userId): 130 | try: 131 | print('projectName=', projectName, ' userId=', userId) 132 | return db.session.query(Project).filter(Project.project_name == projectName) \ 133 | .filter(Project.user_id == userId) \ 134 | .first() 135 | except: 136 | return "error" 137 | 138 | 139 | # 获取项目的正在操作的数据文件地址 140 | def getProjectCurrentDataUrl(projectName): 141 | try: 142 | filters = { 143 | Project.project_name == projectName 144 | } 145 | pro = Project.query.filter(*filters).first() 146 | project_address = pro.project_address 147 | filename = '' 148 | for root, dirs, files in os.walk(project_address): 149 | # print(root) #当前目录路径 150 | # print(dirs) #当前路径下所有子目录 151 | # print(files) #当前路径下所有非目录子文件 152 | for file in files: 153 | if file[-4:] == '.csv': 154 | filename = file 155 | break 156 | break 157 | # print(filename) 158 | if filename == '': 159 | return "error" 160 | else: 161 | # return {'fileUrl': ProjectAddress+'/'+filename, 'projectAddress': ProjectAddress} 162 | return {'fileUrl': 'file://' + project_address + '/' + filename, 'projectAddress': project_address} 163 | except: 164 | return "error" 165 | 166 | 167 | # 获取项目的正在操作的文件数据 168 | def getProjectCurrentData(ss, projectName): 169 | # 解析项目路径,读取csv 170 | urls = getProjectCurrentDataUrl(projectName) 171 | if urls == 'error': 172 | return "error: 项目名或项目路径有误" # 错误类型:项目名或项目路径有误 173 | fileUrl = urls['fileUrl'] # 读本地文件 174 | df = ss.read.csv(fileUrl, header=True, inferSchema=True) 175 | # ss. 176 | # import pandas as pd 177 | # sc = ss.sparkContext 178 | # sqlContext = SQLContext(sc) 179 | # df = pd.read_csv(fileUrl) 180 | # df = sqlContext.createDataFrame(df) 181 | 182 | # df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(fileUrl) 183 | return df 184 | 185 | 186 | def read_data_pandas(file_url): 187 | """ 188 | pandas 读取数据 189 | :param file_url: 190 | :return: 191 | """ 192 | if file_url[-4:] == ".csv": 193 | df = pd.read_csv(file_url, encoding="utf-8") 194 | else: 195 | df = pd.read_excel(file_url, encoding="utf-8") 196 | return df 197 | 198 | 199 | def save_data_pandas(data, file_type="", file_url="", index=0): 200 | """ 201 | pandas 写数据 202 | :return: 203 | """ 204 | if file_type == "": 205 | file_type = 'csv' 206 | if file_url == "": 207 | file_url = const.MIDDATA + str(uuid.uuid1()) 208 | 209 | if file_type == 'json': 210 | file_url = file_url + '.json' 211 | json_str = json.dumps(data, ensure_ascii=False) 212 | with open(file_url, "w", encoding="utf-8") as f: 213 | json.dump(json_str, f, ensure_ascii=False) 214 | elif file_type == 'csv': 215 | file_url = file_url + '.csv' 216 | data.to_csv(file_url, header=True, index=index) 217 | 218 | return file_url 219 | 220 | 221 | def read_data(ss, file_url): 222 | """ 223 | spark 读取数据 224 | :param ss:spark session 225 | :param file_url: 226 | :return: 227 | """ 228 | 229 | df = ss.read.csv(file_url, header=True, inferSchema=True) 230 | return df 231 | 232 | 233 | def save_data(df, file_url=""): 234 | """ 235 | 保存数据 236 | :param df: 237 | :param file_url: 238 | :return: 239 | """ 240 | if file_url == "": 241 | file_url = const.MIDDATA + str(uuid.uuid1()) + '.csv' 242 | df.toPandas().to_csv(file_url, header=True, index=0) 243 | return file_url 244 | 245 | 246 | def mkdir(path): 247 | """ 248 | 根据指定路径创建文件夹 249 | :param path: 250 | :return: 251 | """ 252 | import os 253 | 254 | # 去除首位空格 255 | path = path.strip() 256 | # 去除尾部 \ 符号 257 | path = path.rstrip("\\") 258 | 259 | # 判断路径是否存在 260 | # 存在 True 261 | # 不存在 False 262 | isExists = os.path.exists(path) 263 | 264 | # 判断结果 265 | if not isExists: 266 | # 如果不存在则创建目录 267 | print(path + ' 创建成功') 268 | # 创建目录操作函数 269 | os.makedirs(path) 270 | return True 271 | else: 272 | # 如果目录存在则不创建,并提示目录已存在 273 | print(path + ' 目录已存在') 274 | return False 275 | 276 | 277 | def deldir(path): 278 | import os 279 | if os.path.exists(path): 280 | # 删除文件,可使用以下两种方法。 281 | os.remove(path) 282 | return True 283 | else: 284 | print('no such file:%s' % path) 285 | return False 286 | 287 | 288 | def deltree(path): 289 | import os 290 | if os.path.exists(path): 291 | shutil.rmtree(path) # 递归删除文件夹 292 | return True 293 | else: 294 | print('no such path:%s' % path) 295 | return False 296 | 297 | 298 | def is_number(s): 299 | try: 300 | float(s) 301 | return True 302 | except ValueError: 303 | pass 304 | 305 | try: 306 | import unicodedata 307 | unicodedata.numeric(s) 308 | return True 309 | except (TypeError, ValueError): 310 | pass 311 | return False 312 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | from flask import Flask 4 | from flask_sqlalchemy import SQLAlchemy 5 | 6 | app = Flask(__name__) 7 | 8 | 9 | # 跨域 方法一 :falsk_cors模块 10 | # from flask_cors import CORS 11 | # CORS(app, supports_credentials=True) 12 | 13 | # 跨域支持 方法二:flask 内置的after_request()方法 14 | def after_request(resp): 15 | resp.headers['Access-Control-Allow-Origin'] = '*' 16 | return resp 17 | 18 | 19 | app.after_request(after_request) 20 | 21 | # 加载配置文件 22 | app.config.from_object('config') 23 | db = SQLAlchemy(app) 24 | # 25 | from app.views.datasource import DataSource 26 | from app.views import Project, OperateType, OperateFlow, ProjectModel, Report, Operator 27 | 28 | # from app import test 29 | -------------------------------------------------------------------------------- /app/dao/MLModelDao.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | from app import db 3 | from app.models.MSEntity import MLModel 4 | import traceback 5 | 6 | """ 7 | 提供 ml_model(模型算子训练结果保存表) 的增删改查 8 | """ 9 | 10 | 11 | def create_ml_model(ml_model): 12 | """ 13 | 创建 新的 ml_model记录()保存模型 14 | :param ml_model: 类型 [MLModel] 15 | :return: 16 | """ 17 | try: 18 | session = db.session 19 | session.add(ml_model) 20 | session.commit() 21 | print('成功创建一个算子') 22 | return True 23 | 24 | except Exception: 25 | print(traceback.print_exc()) 26 | return False 27 | 28 | 29 | def get_ml_model(ml_model_id): 30 | """ 31 | 查询记录 32 | :param ml_model_id: 33 | :return: 34 | """ 35 | 36 | filters = {MLModel.id == ml_model_id} 37 | ml_model = MLModel.query.filter(*filters).first() 38 | return ml_model 39 | -------------------------------------------------------------------------------- /app/dao/ModelDao.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | from app.models.MSEntity import Model 3 | from app import db 4 | import traceback 5 | import json 6 | import app.Utils as utils 7 | 8 | """ 9 | 提供 model(项目) 表的增删改查 10 | """ 11 | 12 | 13 | def get_model_by_id(id): 14 | """ 15 | 通过ID 获取 model 16 | :param id: 17 | :return: 18 | """ 19 | 20 | try: 21 | query = db.session.query(Model).filter(Model.id == id).first() 22 | db.session.commit() 23 | return query 24 | 25 | except Exception: 26 | print(traceback.print_exc()) 27 | return False 28 | 29 | 30 | def get_model_by_project_id(project_id): 31 | """ 32 | 通过项目ID 获取 项目对应的model 33 | :return: 34 | """ 35 | 36 | try: 37 | query = db.session.query(Model).filter(Model.project_id == project_id).first() 38 | db.session.commit() 39 | return query 40 | 41 | except Exception: 42 | print(traceback.print_exc()) 43 | return False 44 | 45 | 46 | def update_with_project_id(project_id, start_nodes, relationship, config_order): 47 | """ 48 | 通过项目ID 更新 项目对应的model 49 | :return: 50 | """ 51 | 52 | try: 53 | start_nodes = utils.list_str_to_list(start_nodes) 54 | relationship = utils.list_str_to_list(relationship) 55 | relationship_item_str = [] 56 | for item in relationship: 57 | relationship_item_str.append(str(item)) 58 | config = json.dumps({'config_order': config_order, 'relationship': '*,'.join(relationship_item_str)}, 59 | ensure_ascii=False) 60 | query = db.session.query(Model) 61 | query.filter(Model.project_id == project_id).update( 62 | {Model.start_nodes: ','.join(start_nodes), 63 | Model.config: config}) 64 | db.session.commit() 65 | print('更新完成') 66 | return True 67 | 68 | except Exception: 69 | print(traceback.print_exc()) 70 | return False 71 | -------------------------------------------------------------------------------- /app/dao/ModelExecuteDao.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | from app.models.MSEntity import ModelExecute 3 | from app import db 4 | import traceback 5 | 6 | """ 7 | 模型执行表 增删改查 8 | """ 9 | 10 | 11 | def get_model_execute_by_id(model_execute_id): 12 | """ 13 | 通过 id 查询 model_execute 14 | :param model_execute_id: 15 | :return: 16 | """ 17 | try: 18 | query = db.session.query(ModelExecute).filter(ModelExecute.id == model_execute_id).first() 19 | db.session.commit() 20 | return query 21 | except Exception: 22 | print(traceback.print_exc()) 23 | return False 24 | 25 | 26 | def create_model_execute(model_execute): 27 | """ 28 | 创建 model_execute 29 | :param model_execute: 类型 ModelExecute 30 | :return: 31 | """ 32 | try: 33 | session = db.session 34 | session.add(model_execute) 35 | session.commit() 36 | print('成功创建一条执行记录') 37 | return model_execute 38 | except Exception: 39 | print(traceback.print_exc()) 40 | return False 41 | 42 | 43 | def update_model_execute(model_execute_id, status, run_info, end_time): 44 | """ 45 | 更新 model_execute 46 | 47 | :param model_execute_id: 48 | :param status: 49 | :param run_info: 50 | :param end_time: 51 | :return: 52 | """ 53 | try: 54 | filters = { 55 | ModelExecute.id == model_execute_id, 56 | } 57 | result = ModelExecute.query.filter(*filters).first() 58 | result.status = status 59 | result.run_info = run_info 60 | result.end_time = end_time 61 | db.session.commit() 62 | return True 63 | except Exception as e: 64 | print(traceback.print_exc()) 65 | return False 66 | -------------------------------------------------------------------------------- /app/dao/OperatorDao.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | from app.models.MSEntity import Operator 3 | from app import db 4 | import traceback 5 | 6 | """ 7 | operator(算子)表 增删改查 8 | """ 9 | 10 | 11 | def update_operator_by_id(operator_id, status, operator_output_url="", run_info=""): 12 | """ 13 | 通过 operator_id 更新 operator的执行状态、结果保存路径、运行信息 14 | :param operator_id: 15 | :param status: 16 | :param operator_output_url: 17 | :param run_info: 18 | :return: 19 | """ 20 | try: 21 | filters = { 22 | Operator.id == operator_id, 23 | } 24 | result = Operator.query.filter(*filters).first() 25 | result.status = status 26 | result.operator_output_url = operator_output_url 27 | result.run_info = run_info 28 | db.session.commit() 29 | return True 30 | except Exception as e: 31 | print(traceback.print_exc()) 32 | return False 33 | 34 | 35 | def update_operator_input_url(operator_id, operator_input_url): 36 | """ 37 | 通过 operator_id 更新 operator的输入路径 38 | :param operator_id: 39 | :param operator_input_url: 40 | :return: 41 | """ 42 | try: 43 | filters = { 44 | Operator.id == operator_id, 45 | } 46 | result = Operator.query.filter(*filters).first() 47 | result.operator_input_url = operator_input_url 48 | db.session.commit() 49 | return True 50 | except Exception as e: 51 | print(traceback.print_exc()) 52 | return False 53 | 54 | 55 | def get_operator_by_id(operator_id): 56 | """ 57 | 通过 id 查询 operator 58 | :param operator_id: 59 | :return: 60 | """ 61 | try: 62 | query = db.session.query(Operator).filter(Operator.id == operator_id).first() 63 | db.session.commit() 64 | return query 65 | except Exception: 66 | print(traceback.print_exc()) 67 | return False 68 | 69 | 70 | def get_operator_by_ids(operator_ids): 71 | """ 72 | TODO:不好使待调试 73 | 通过 id集合 查询 operator 74 | :param operator_ids:[] 75 | :return: 76 | """ 77 | try: 78 | query = db.session.query(Operator).filter(Operator.id in operator_ids) 79 | db.session.commit() 80 | return query 81 | except Exception: 82 | print(traceback.print_exc()) 83 | return False 84 | 85 | 86 | def get_operator_by_model_id(model_id): 87 | """ 88 | 通过 model_id 查询 operator 89 | :param model_id: 90 | :return: 91 | """ 92 | try: 93 | query = db.session.query(Operator).filter(Operator.model_id == model_id) 94 | db.session.commit() 95 | return query 96 | except Exception: 97 | print(traceback.print_exc()) 98 | return False 99 | 100 | 101 | def delete_operator_by_model_id(model_id): 102 | """ 103 | 通过 model_id 删除 operator 104 | :param model_id: 105 | :return: 106 | """ 107 | try: 108 | db.session.query(Operator).filter(Operator.model_id == model_id).delete() 109 | db.session.commit() 110 | return True 111 | except Exception: 112 | print(traceback.print_exc()) 113 | return False 114 | 115 | 116 | def delete_operator_by_id(operator_id): 117 | """ 118 | 通过 id 删除 operator 119 | :param operator_id: 120 | :return: 121 | """ 122 | try: 123 | db.session.query(Operator).filter(Operator.id == operator_id).delete() 124 | db.session.commit() 125 | return True 126 | except Exception: 127 | print(traceback.print_exc()) 128 | return False 129 | 130 | 131 | def create_operator(operators): 132 | """ 133 | 创建 新的operator(算子) 134 | :param operators: 类型 [Operator] 135 | :return: 136 | """ 137 | try: 138 | session = db.session 139 | session.add_all(operators) 140 | session.commit() 141 | return True 142 | 143 | except Exception: 144 | print(traceback.print_exc()) 145 | return False 146 | -------------------------------------------------------------------------------- /app/dao/OperatorTypeDao.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | from app.models.MSEntity import OperatorType 3 | 4 | """ 5 | operator_type(算子种类)表 增删改查 6 | """ 7 | 8 | 9 | def get_all_operator_type(): 10 | """ 11 | 查询所有的 算子种类 12 | :return: 13 | """ 14 | return OperatorType.query.all() 15 | -------------------------------------------------------------------------------- /app/dao/ProjectDao.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | from app.models.MSEntity import Project 3 | 4 | """ 5 | 提供 project(项目) 表的增删改查 6 | """ 7 | 8 | 9 | def get_project_by_id(project_id): 10 | """ 11 | 通过项目ID获取项目 12 | :return: 13 | """ 14 | 15 | try: 16 | filters = { 17 | Project.id == project_id, 18 | } 19 | return Project.query.filter(*filters).first() 20 | except Exception: 21 | return "error" 22 | -------------------------------------------------------------------------------- /app/dao/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | dao 层 3 | 提供操作数据库的功能函数 4 | """ 5 | -------------------------------------------------------------------------------- /app/enmus/EnumConst.py: -------------------------------------------------------------------------------- 1 | # 导入枚举类 2 | from enum import Enum 3 | 4 | """ 5 | v1版本,弃用 6 | """ 7 | 8 | 9 | # 继承枚举类 10 | # 算子编码枚举类 11 | class OperatorType(Enum): 12 | ## 数据预处理 13 | # 过滤 14 | FILTER = '1001' 15 | # 排序 16 | SORT = '1002' 17 | # 按列拆分 18 | COLUMNSPLIT = '1003' 19 | # 按行拆分 20 | ROWSPLIT = '1004' 21 | # 多列合并 22 | COLUMNMERGE = '1005' 23 | # 数据列替换 24 | REPLACE = '1006' 25 | # 空值填充 26 | FILLNULLVALUE = '1007' 27 | # 列映射 28 | COLUMNMAP = '1008' 29 | 30 | ## 特征工程 31 | # 分位数离散化 32 | QUANTILEDISCRETIZATION = '2001' 33 | # 向量索引转换 34 | VECTORINDEXER = '2002' 35 | # 标准化列 36 | STANDARDSCALER = '2003' 37 | # 降维 38 | PCA = '2004' 39 | # 字符串转标签 40 | STRINGINDEXER = '2005' 41 | # 独热编码 42 | ONEHOTENCODER = '2006' 43 | # 多项式扩展 44 | POLYNOMIALEXPANSION = '2007' 45 | # 卡放选择 46 | CHISQSELECTOR = '2008' 47 | -------------------------------------------------------------------------------- /app/enmus/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/enmus/__init__.py -------------------------------------------------------------------------------- /app/ml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/ml/__init__.py -------------------------------------------------------------------------------- /app/ml/multipleClassification/LR.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, LogisticRegressionSummary, \ 3 | LogisticRegressionTrainingSummary 4 | from pyspark.sql import Row, DataFrame 5 | from pyspark.ml.linalg import Vectors 6 | from app.Utils import * 7 | 8 | ''' 9 | class pyspark.ml.classification.LogisticRegression(featuresCol='features', labelCol='label', predictionCol='prediction', 10 | maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, fitIntercept=True, threshold=0.5, thresholds=None, 11 | probabilityCol='probability', rawPredictionCol='rawPrediction', standardization=True, weightCol=None, 12 | aggregationDepth=2, family='auto', lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, 13 | lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None) 14 | 15 | featuresCol # 特征列 16 | labelCol # 标签列 17 | predictionCol # 预测输出列 18 | maxIter # 最大迭代轮数 19 | regParam # 正则化参数 20 | elasticNetParam # ElasticNet混合参数,范围为[0,1]。对于alpha = 0,惩罚是L2惩罚。对于alpha = 1,这是L1惩罚(默认值:0.0) 21 | tol # 迭代算法的收敛容限(> = 0)(默认值:1e-06) 22 | fitIntercept # 是否训练截距项(默认值:True) 23 | threshold # 二进制分类预测中的阈值,范围为[0,1](默认值:0.5)。 24 | thresholds # 多类别分类中的阈值,用于调整预测每个类别的概率。数组的长度必须等于类的数量,其值必须大于0,但最多一个值可以为0,这是预测值。p/t最大的类是可预测的,其中p是该类的原始概率,t是该类的概率阈值(未定义) 25 | rawPredictionCol:原始预测(也称为置信度)列名称(默认值:rawPrediction) 26 | standardization: whether to standardize the training features before fitting the model (default: True) 27 | weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0 (current: weight) 28 | aggregationDepth # suggested depth for treeAggregate (>= 2) (default: 2) 29 | family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial. (default: auto) 30 | lowerBoundsOnCoefficients:如果在边界约束优化下拟合,则系数的下界。 (未定义) 31 | lowerBoundsOnIntercepts:如果在边界约束优化下拟合,则截距的下限。 (未定义) 32 | upperBoundsOnIntercepts:如果在边界约束优化下拟合,则截距的上限。 (未定义) 33 | 34 | ''' 35 | 36 | 37 | def is_number(s): 38 | try: 39 | float(s) 40 | return True 41 | except ValueError: 42 | pass 43 | 44 | try: 45 | import unicodedata 46 | unicodedata.numeric(s) 47 | return True 48 | except (TypeError, ValueError): 49 | pass 50 | 51 | return False 52 | 53 | 54 | def project_url(projectName): 55 | urls = getProjectCurrentDataUrl(projectName) 56 | if urls == 'error': 57 | return "error: 项目名或项目路径有误" # 错误类型:项目名或项目路径有误 58 | return urls['projectAddress'] # 项目地址 59 | 60 | 61 | def lr(ss, data, label_index, feature_indexs, project_url): 62 | # 1.构造训练数据集 63 | def func(x): 64 | features_data = [] 65 | for feature in feature_indexs: 66 | if (is_number(x[feature])): 67 | features_data.append(float(x[feature])) 68 | else: 69 | features_data.append(0.0) 70 | label_data = 0.0 71 | if (is_number(x[label_index])): 72 | label_data = float(x[label_index]) 73 | return Row(label=label_data, features=Vectors.dense(features_data)) 74 | 75 | training_set = data.rdd.map(list).map(lambda x: func(x)).toDF() 76 | 77 | # 2.训练模型 78 | lr_param = LogisticRegression(regParam=0.01, family='multinomial') 79 | lr_model = lr_param.fit(training_set) 80 | print(lr_model.coefficientMatrix) # 系数 81 | print(lr_model.interceptVector) # 截距 82 | # print(lr_model.explainParams()) # 参数以及其注解 83 | 84 | # 3.保存模型 85 | # model_path = project_url + '/model/multipleClassification/lr' 86 | # lr_model.write().overwrite().save(model_path) 87 | # 88 | # # 4.读取模型 89 | # lr2 = lr_model.load(model_path) 90 | 91 | # 5.预测 92 | result = lr_model.transform(training_set).head() 93 | print(result.prediction) 94 | 95 | LogisticRegressionTrainingSummary 96 | sum = lr_model.summary 97 | 98 | # 6.评估 99 | summary = lr_model.evaluate(training_set) 100 | summary.show() 101 | 102 | 103 | userId = 1 104 | functionName = 'lr' 105 | projectName = '订单分析' 106 | label = 0 # 标签列 107 | features = [2, 4, 10, 11, 12] # 特征列 108 | project_path = project_url(projectName) # 项目路径 109 | # spark会话 110 | ss = getSparkSession(userId, functionName) 111 | # 解析项目路径,读取csv 112 | fileUrl = '/home/zk/data/adult.csv' 113 | df = ss.read.csv(fileUrl) 114 | df.filter 115 | print(df.dtypes) 116 | 117 | df.show() 118 | # df = getProjectCurrentData(ss, projectName) 119 | # 罗辑回归二分类 120 | lr(ss, df, label, features, project_path) 121 | -------------------------------------------------------------------------------- /app/ml/multipleClassification/MPC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from pyspark.ml.classification import MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel 3 | from pyspark.sql import Row, DataFrame 4 | from pyspark.ml.linalg import Vectors 5 | from app.Utils import * 6 | 7 | 8 | # userId = 1 9 | # functionName = 'gdbt' 10 | # projectName = '订单分析' 11 | # # spark会话 12 | # spark = getSparkSession(userId, functionName) 13 | # df = spark.createDataFrame([ 14 | # (0.0, Vectors.dense([0.0, 0.0])), 15 | # (1.0, Vectors.dense([0.0, 1.0])), 16 | # (1.0, Vectors.dense([1.0, 0.0])), 17 | # (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"]) 18 | # mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 2, 2], blockSize=1, seed=123) 19 | # model = mlp.fit(df) 20 | # print(model.layers) 21 | # print(model.weights.size) 22 | # testDF = spark.createDataFrame([ 23 | # (Vectors.dense([1.0, 0.0]),), 24 | # (Vectors.dense([0.0, 0.0]),)], ["features"]) 25 | # model.transform(testDF).select("features", "prediction").show() 26 | 27 | 28 | def project_url(projectName): 29 | urls = getProjectCurrentDataUrl(projectName) 30 | if urls == 'error': 31 | return "error: 项目名或项目路径有误" # 错误类型:项目名或项目路径有误 32 | return urls['projectAddress'] # 项目地址 33 | 34 | 35 | def mpc(ss, data, label_index, feature_indexs, project_url): 36 | # 1.构造训练数据集 37 | def func(x): 38 | features_data = [] 39 | for feature in feature_indexs: 40 | features_data.append(x[feature]) 41 | return Row(label=label_index, features=Vectors.dense(features_data)) 42 | 43 | training_set = data.rdd.map(lambda x: func(x)).toDF() 44 | 45 | # 2.训练模型 46 | # maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs", initialWeights=None 47 | mpc_param = MultilayerPerceptronClassifier(maxIter=100, tol=1e-6, blockSize=128, stepSize=0.03, solver="l-bfgs") 48 | mpc_param.setSeed(1) 49 | mpc_param.setLayers([4, 2, 2]) 50 | mpc_model = mpc_param.fit(training_set) 51 | 52 | # 3.保存模型 53 | model_path = project_url + '/model/multipleClassification/mpc' 54 | mpc_model.write().overwrite().save(model_path) 55 | 56 | # 4.读取模型 57 | mpc2 = MultilayerPerceptronClassificationModel.load(model_path) 58 | 59 | # 5.预测 60 | result = mpc2.transform(training_set).select("prediction", "features").show() 61 | 62 | 63 | userId = 1 64 | functionName = 'mpc' 65 | projectName = '订单分析' 66 | label = 0 # 标签列 67 | features = [12, 13, 14, 15] # 特征列 68 | project_path = project_url(projectName) # 项目路径 69 | # spark会话 70 | ss = getSparkSession(userId, functionName) 71 | # 解析项目路径,读取csv 72 | df = getProjectCurrentData(ss, projectName) 73 | # 罗辑回归二分类 74 | mpc(ss, df, label, features, project_path) 75 | -------------------------------------------------------------------------------- /app/ml/multipleClassification/RF.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, LogisticRegressionSummary, \ 3 | LogisticRegressionTrainingSummary, RandomForestClassifier 4 | from pyspark.sql import Row, DataFrame 5 | from pyspark.ml.linalg import Vectors 6 | from app.Utils import * 7 | 8 | ''' 9 | It supports both binary and multiclass labels, as well as both continuous and categoricaleatures. 10 | 11 | ''' 12 | 13 | 14 | def is_number(s): 15 | try: 16 | float(s) 17 | return True 18 | except ValueError: 19 | pass 20 | 21 | try: 22 | import unicodedata 23 | unicodedata.numeric(s) 24 | return True 25 | except (TypeError, ValueError): 26 | pass 27 | 28 | return False 29 | 30 | 31 | def project_url(projectName): 32 | urls = getProjectCurrentDataUrl(projectName) 33 | if urls == 'error': 34 | return "error: 项目名或项目路径有误" # 错误类型:项目名或项目路径有误 35 | return urls['projectAddress'] # 项目地址 36 | 37 | 38 | def rf(ss, data, label_index, feature_indexs, project_url): 39 | # 1.构造训练数据集 40 | def func(x): 41 | features_data = [] 42 | for feature in feature_indexs: 43 | features_data.append(x[feature]) 44 | return Row(label=label_index, features=Vectors.dense(features_data)) 45 | 46 | training_set = data.rdd.map(list).map(lambda x: func(x)).toDF() 47 | 48 | # 2.训练模型 49 | rf_param = RandomForestClassifier(numTrees=50) 50 | rf_model = rf_param.fit(training_set) 51 | 52 | # 3.保存模型 53 | model_path = project_url + '/model/multipleClassification/rf' 54 | rf_model.write().overwrite().save(model_path) 55 | 56 | # 4.读取模型 57 | rf2 = rf_model.load(model_path) 58 | 59 | # 5.预测 60 | rf_pred = rf2.transform(training_set) 61 | rf_pred.select("prediction", "features").show() 62 | 63 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator 64 | # 6.评估 65 | rf_accuracy = MulticlassClassificationEvaluator(metricName='accuracy').evaluate(rf_pred) 66 | print("RF's accuracy is %f" % rf_accuracy) 67 | rf_precision = MulticlassClassificationEvaluator(metricName='weightedPrecision').evaluate(rf_pred) 68 | print("RF's precision is %f" % rf_precision) 69 | 70 | 71 | userId = 1 72 | functionName = 'lr' 73 | projectName = '订单分析' 74 | label = 0 # 标签列 75 | features = [2, 4, 10, 11, 12] # 特征列 76 | project_path = project_url(projectName) # 项目路径 77 | # spark会话 78 | ss = getSparkSession(userId, functionName) 79 | # 解析项目路径,读取csv 80 | fileUrl = '/home/zk/data/adult.csv' 81 | df = ss.read.csv(fileUrl) 82 | df.filter 83 | print(df.dtypes) 84 | 85 | df.show() 86 | # df = getProjectCurrentData(ss, projectName) 87 | # 罗辑回归二分类 88 | rf(ss, df, label, features, project_path) 89 | -------------------------------------------------------------------------------- /app/ml/multipleClassification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/ml/multipleClassification/__init__.py -------------------------------------------------------------------------------- /app/ml/secondClassification/GBDT.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from pyspark.ml.linalg import Vectors 3 | from pyspark.ml.classification import GBTClassifier, GBTClassificationModel 4 | from pyspark.ml.feature import StringIndexer 5 | from pyspark.sql.types import Row 6 | from app.Utils import * 7 | 8 | # GBDT(Gradient Boosting Decision Tree) 又叫 MART(Multiple Additive Regression Tree),是一种迭代的决策树算法, 9 | # 该算法由多棵决策树组成,所有树的结论累加起来做最终答案。 10 | 11 | 12 | # GBTC API 13 | # GBTClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', 14 | # maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, 15 | # cacheNodeIds=False, checkpointInterval=10, lossType='logistic', maxIter=20, stepSize=0.1, 16 | # seed=None, subsamplingRate=1.0, featureSubsetStrategy='all') 17 | 18 | trainDataRatio = 0.75 # 训练数据比例 19 | maxIter = 20 # 迭代次数 20 | stepSize = 0.1 # 学习速率(0-1) 21 | maxDepth = 5 # 数的最大深度[1,100] 22 | minInstancesPerNode = 1 # 叶子节点最少样本数[1,1000] 23 | seed = 1 # 随机数产生器种子[0,10] 24 | maxBins = 32 # 一个特征分裂的最大数量[1,1000] 25 | 26 | 27 | def model_url(projectName): 28 | urls = getProjectCurrentDataUrl(projectName) 29 | if urls == 'error': 30 | return "error: 项目名或项目路径有误" # 错误类型:项目名或项目路径有误 31 | return urls['projectAddress'] + '/model/secondClassification' # 项目地址 32 | 33 | 34 | def gbdt(data, label_index, feature_indexs, project_url): 35 | 36 | # 2.构造训练数据集 37 | data_set = data.rdd.map(list) 38 | (train_data, test_data) = data_set.randomSplit([trainDataRatio, 1 - trainDataRatio]) 39 | data.show() 40 | 41 | def func(x): 42 | features_data = [] 43 | for feature in feature_indexs: 44 | features_data.append(x[feature]) 45 | return Row(label=label_index, features=Vectors.dense(features_data)) 46 | 47 | training_set = train_data.map(list).map(lambda x: func(x)).toDF() 48 | training_set.show() 49 | train_num = training_set.count() 50 | print("训练样本数:{}".format(train_num)) 51 | 52 | # 3.使用GBDT进行训练 53 | string_indexer = StringIndexer(inputCol="label", outputCol="indexed") 54 | si_model = string_indexer.fit(training_set) 55 | tf = si_model.transform(training_set) 56 | 57 | gbdt = GBTClassifier(labelCol="indexed", 58 | maxIter=maxIter, stepSize=stepSize, maxDepth=maxDepth, minInstancesPerNode=minInstancesPerNode, 59 | seed=seed) 60 | gbdt_model = gbdt.fit(tf) 61 | print(gbdt_model.featureImportances) 62 | # 保存模型 63 | model_path = project_url + '/gbdt' 64 | gbdt_model.write().overwrite().save(model_path) 65 | 66 | # 加载模型 67 | gbdt_model2 = GBTClassificationModel.load(model_path) 68 | 69 | # 预测 70 | gbdt_model2.transform(training_set).select("prediction", "label", "features").show(5) 71 | 72 | 73 | userId = 1 74 | functionName = 'gdbt' 75 | projectName = '订单分析' 76 | label = 0 # 标签列 77 | features = [12, 13, 14, 15] # 特征列 78 | model_path = model_url(projectName) # 项目路径 79 | 80 | # spark会话 81 | ss = getSparkSession(userId, functionName) 82 | # 解析项目路径,读取csv 83 | df = getProjectCurrentData(ss, projectName) 84 | # if df == "error: 项目名或项目路径有误": 85 | # state = False 86 | # reason = df 87 | # return returnDataModel(df, state, reason) 88 | gbdt(df, label, features, model_path) # 二分类 89 | 90 | # 错误 'PipelinedRDD' object has no attribute '_jdf' 91 | # 报这个错,是因为导入的机器学习包错误所致。 92 | # pyspark.ml 是用来处理DataFrame。 93 | # pyspark.mllib是用来处理RDD。 94 | -------------------------------------------------------------------------------- /app/ml/secondClassification/LR.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, LogisticRegressionSummary 3 | from pyspark.sql import Row 4 | from pyspark.ml.linalg import Vectors 5 | from app.Utils import * 6 | 7 | ''' 8 | class pyspark.ml.classification.LogisticRegression(featuresCol='features', labelCol='label', predictionCol='prediction', 9 | maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, fitIntercept=True, threshold=0.5, thresholds=None, 10 | probabilityCol='probability', rawPredictionCol='rawPrediction', standardization=True, weightCol=None, 11 | aggregationDepth=2, family='auto', lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, 12 | lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None) 13 | 14 | featuresCol # 特征列 15 | labelCol # 标签列 16 | predictionCol # 预测输出列 17 | maxIter # 最大迭代轮数 18 | regParam # 正则化参数 19 | elasticNetParam # ElasticNet混合参数,范围为[0,1]。对于alpha = 0,惩罚是L2惩罚。对于alpha = 1,这是L1惩罚(默认值:0.0) 20 | tol # 迭代算法的收敛容限(> = 0)(默认值:1e-06) 21 | fitIntercept # 是否训练截距项(默认值:True) 22 | threshold # 二进制分类预测中的阈值,范围为[0,1](默认值:0.5)。 23 | thresholds # 多类别分类中的阈值,用于调整预测每个类别的概率。数组的长度必须等于类的数量,其值必须大于0,但最多一个值可以为0,这是预测值。p/t最大的类是可预测的,其中p是该类的原始概率,t是该类的概率阈值(未定义) 24 | rawPredictionCol:原始预测(也称为置信度)列名称(默认值:rawPrediction) 25 | standardization: whether to standardize the training features before fitting the model (default: True) 26 | weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0 (current: weight) 27 | aggregationDepth # suggested depth for treeAggregate (>= 2) (default: 2) 28 | family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial. (default: auto) 29 | lowerBoundsOnCoefficients:如果在边界约束优化下拟合,则系数的下界。 (未定义) 30 | lowerBoundsOnIntercepts:如果在边界约束优化下拟合,则截距的下限。 (未定义) 31 | upperBoundsOnIntercepts:如果在边界约束优化下拟合,则截距的上限。 (未定义) 32 | 33 | ''' 34 | 35 | 36 | def project_url(projectName): 37 | urls = getProjectCurrentDataUrl(projectName) 38 | if urls == 'error': 39 | return "error: 项目名或项目路径有误" # 错误类型:项目名或项目路径有误 40 | return urls['projectAddress'] # 项目地址 41 | 42 | 43 | def lr(ss, data, label_index, feature_indexs, project_url): 44 | # 1.构造训练数据集 45 | def func(x): 46 | features_data = [] 47 | for feature in feature_indexs: 48 | features_data.append(x[feature]) 49 | return Row(label=label_index, features=Vectors.dense(features_data)) 50 | 51 | training_set = data.rdd.map(list).map(lambda x: func(x)).toDF() 52 | 53 | # 2.训练模型 54 | lr_param = LogisticRegression(regParam=0.01) 55 | lr_model = lr_param.fit(training_set) 56 | print(lr_model.coefficients) # 系数 57 | print(lr_model.intercept) # 截距 58 | print(lr_model.explainParams()) # 参数以及其注解 59 | 60 | # 3.保存模型 61 | model_path = project_url + '/model/secondClassification/lr' 62 | lr_model.write().overwrite().save(model_path) 63 | 64 | # 4.读取模型 65 | lr2 = lr_model.load(model_path) 66 | 67 | # 5.预测 68 | result = lr2.transform(training_set).head() 69 | print(result.prediction) 70 | 71 | sum = lr_model.summary 72 | print('------roc--', sum.areaUnderROC) 73 | 74 | # 6.评估 75 | # summary = lr_model.evaluate(training_set) 76 | 77 | 78 | userId = 1 79 | functionName = 'gdbt' 80 | projectName = '订单分析' 81 | label = 0 # 标签列 82 | features = [12, 13, 14, 15] # 特征列 83 | project_path = project_url(projectName) # 项目路径 84 | # spark会话 85 | ss = getSparkSession(userId, functionName) 86 | # 解析项目路径,读取csv 87 | df = getProjectCurrentData(ss, projectName) 88 | # 罗辑回归二分类 89 | lr(ss, df, label, features, project_path) 90 | -------------------------------------------------------------------------------- /app/ml/secondClassification/SVM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from pyspark.mllib.classification import SVMModel 3 | from pyspark.mllib.classification import SVMWithSGD 4 | from pyspark.mllib.regression import LabeledPoint 5 | from pyspark.mllib.evaluation import BinaryClassificationMetrics 6 | from app.Utils import * 7 | import numpy as np 8 | 9 | 10 | def model_url(projectName): 11 | urls = getProjectCurrentDataUrl(projectName) 12 | if urls == 'error': 13 | return "error: 项目名或项目路径有误" # 错误类型:项目名或项目路径有误 14 | return urls['projectAddress'] + '/model/secondClassification' # 项目地址 15 | 16 | 17 | def svm(ss, data, label_index, feature_indexs, model_url): 18 | # 1. 准备数据 19 | def func(x): 20 | features_data = [] 21 | for feature in feature_indexs: 22 | features_data.append(x[feature]) 23 | return LabeledPoint(label=np.random.randint(0, 2), features=features_data) 24 | 25 | training_data = data.rdd.map(lambda x: func(x)) 26 | 27 | # 2. 训练 28 | svm_model = SVMWithSGD.train(training_data, iterations=20, step=1.0, regParam=0.01, 29 | miniBatchFraction=1.0, initialWeights=None, regType="l2", 30 | intercept=False, validateData=True, convergenceTol=0.001) 31 | 32 | # 3.预测 33 | predict_data = training_data.map(lambda x: x.features) 34 | prediction = svm_model.predict(predict_data) 35 | print(prediction.take(10)) 36 | # print("真实值:{},预测值{}".format(prediction, training_data.first().label)) 37 | 38 | # # 4.保存模型 39 | # svm_model_path = model_url + '/svm' 40 | # deltree(svm_model_path) # 删除已经存在的模型 41 | # svm_model.save(ss.sparkContext, svm_model_path) 42 | # 43 | # # 5.加载模型 44 | # same_model = SVMModel.load(ss.sparkContext, svm_model_path) 45 | 46 | # 6.模型评估 47 | evl(training_data, svm_model) 48 | 49 | 50 | def evl(data, svmModel): 51 | ## 正确率和错误率 52 | # lrTotalCorrect = data.map(lambda r: 1 if (lrModel.predict(r.features) == r.label) else 0).reduce(lambda x, y: x + y) 53 | # lrAccuracy = lrTotalCorrect / float(data.count()) # 0.5136044023234485 54 | 55 | svmTotalCorrect = data.map(lambda r: 1 if (svmModel.predict(r.features) == r.label) else 0).reduce( 56 | lambda x, y: x + y) 57 | svmAccuracy = svmTotalCorrect / float(data.count()) # 0.5136044023234485 58 | # 59 | # nbTotalCorrect = data_for_bayes.map(lambda r: 1 if (bayesModel.predict(r.features) == r.label) else 0).reduce( 60 | # lambda x, y: x + y) 61 | # nbAccuracy = nbTotalCorrect / float(data_for_bayes.count()) # 0.5799449709568939 62 | 63 | # dt_predictions = dtModel.predict(data.map(lambda x: x.features)) 64 | # labelsAndPredictions = data.map(lambda x: x.label).zip(dt_predictions) 65 | # dtTotalCorrect = labelsAndPredictions.map(lambda r: 1 if (r[0] == r[1]) else 0).reduce(lambda x, y: x + y) 66 | # dtAccuracy = dtTotalCorrect / float(data.count()) # 0.654234179150107 67 | 68 | # Compute raw scores on the test set 69 | # lrPredictionAndLabels = data.map(lambda lp: (float(lrModel.predict(lp.features)), lp.label)) 70 | # # Instantiate metrics object 71 | # lrmetrics = BinaryClassificationMetrics(lrPredictionAndLabels) 72 | # # Area under precision-recall curve 73 | # print("Area under PR = %s" % lrmetrics.areaUnderPR) 74 | # # Area under ROC curve 75 | # print("Area under ROC = %s" % lrmetrics.areaUnderROC) 76 | 77 | # 清除默认阈值,这样会输出原始的预测评分,即带有确信度的结果 78 | svmModel.clearThreshold() 79 | predict_data = data.map(lambda x: x.features) 80 | prediction = svmModel.predict(predict_data) 81 | print(prediction.take(10)) 82 | 83 | svmPredictionAndLabels = data.map(lambda lp: (float(svmModel.predict(lp.features)), lp.label)) 84 | svmMetrics = BinaryClassificationMetrics(svmPredictionAndLabels) 85 | print("Area under PR = %s" % svmMetrics.areaUnderPR) 86 | print("Area under ROC = %s" % svmMetrics.areaUnderROC) 87 | 88 | # bayesPredictionAndLabels = data_for_bayes.map(lambda lp: (float(bayesModel.predict(lp.features)), lp.label)) 89 | # bayesMetrics = BinaryClassificationMetrics(bayesPredictionAndLabels) 90 | # print("Area under PR = %s" % bayesMetrics.areaUnderPR) 91 | # print("Area under ROC = %s" % bayesMetrics.areaUnderROC) 92 | 93 | 94 | userId = 1 95 | functionName = 'gdbt' 96 | projectName = '订单分析' 97 | label = 0 # 标签列 98 | features = ["数量", "折扣", "利润", "装运成本"] # 特征列 99 | model_path = model_url(projectName) # 项目路径 100 | 101 | # spark会话 102 | ss = getSparkSession(userId, functionName) 103 | # 解析项目路径,读取csv 104 | df = getProjectCurrentData(ss, projectName) 105 | # svm二分类 106 | svm(ss, df, label, features, model_path) 107 | 108 | ''' 109 | 错误 'PipelinedRDD' object has no attribute 'show' 110 | 报这个错,是因为 df.show() is only for spark DataFrame 所致。 111 | ''' 112 | -------------------------------------------------------------------------------- /app/ml/secondClassification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/ml/secondClassification/__init__.py -------------------------------------------------------------------------------- /app/models/MSEntity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from app import db 3 | import click 4 | 5 | 6 | # @app.cli.command() 7 | def initdb(): 8 | """ 9 | 创建数据库 10 | :return: 11 | """ 12 | db.create_all() 13 | click.echo('Initialized database.') 14 | 15 | 16 | # @app.cli.command() 17 | def dropdb(): 18 | """ 19 | 删除数据库 20 | :return: 21 | """ 22 | db.drop_all() 23 | click.echo('Drop database.') 24 | 25 | 26 | class DataSource(db.Model): 27 | """ 28 | 数据源类 29 | """ 30 | __tablename__ = 'data_source' 31 | id = db.Column(db.Integer, primary_key=True) 32 | file_name = db.Column(db.String(64), unique=True, index=True) 33 | file_url = db.Column(db.Text, unique=True) 34 | file_type = db.Column(db.String(64)) 35 | create_user = db.Column(db.String(64)) 36 | open_level = db.Column(db.String(64)) 37 | create_time = db.Column(db.String(64)) 38 | 39 | 40 | class User(db.Model): 41 | """ 42 | 用户类 43 | """ 44 | __tablename__ = 'user' 45 | id = db.Column(db.Integer, primary_key=True) 46 | user_name = db.Column(db.String(64), unique=True, index=True) 47 | password = db.Column(db.String(128)) 48 | 49 | 50 | class Project(db.Model): 51 | """ 52 | 项目类 53 | """ 54 | __tablename__ = 'project' 55 | id = db.Column(db.Integer, primary_key=True) 56 | project_name = db.Column(db.String(64), unique=True, index=True) 57 | project_address = db.Column(db.String(256), unique=True, index=True) 58 | user_id = db.Column(db.Integer, db.ForeignKey('user.id')) 59 | dataSource_id = db.Column(db.Integer, db.ForeignKey('data_source.id')) 60 | 61 | 62 | class ProcessFlow(db.Model): 63 | """ 64 | 数据处理流程类 65 | """ 66 | __tablename__ = 'process_flow' 67 | id = db.Column(db.Integer, primary_key=True) 68 | project_id = db.Column(db.String(64), unique=True, index=True) 69 | operates = db.Column(db.String(13000)) 70 | cur_ope_id = db.Column(db.String(128)) 71 | links = db.Column(db.String(5000)) 72 | 73 | 74 | class Model(db.Model): 75 | """ 76 | DAG图模型类 77 | """ 78 | __tablename__ = 'model' 79 | id = db.Column(db.Integer, primary_key=True) 80 | model_name = db.Column(db.String(64)) 81 | project_id = db.Column(db.Integer) 82 | start_nodes = db.Column(db.String(2048)) 83 | config = db.Column(db.String(8192)) 84 | create_time = db.Column(db.String(32)) 85 | 86 | 87 | class OperatorType(db.Model): 88 | """ 89 | 算子的类型(如:读数据,filter) 90 | """ 91 | __tablename__ = 'operator_type' 92 | id = db.Column(db.Integer, primary_key=True) 93 | type_name = db.Column(db.String(128)) 94 | type_label = db.Column(db.String(128)) 95 | 96 | 97 | class Operator(db.Model): 98 | """ 99 | 算子类 100 | """ 101 | __tablename__ = 'operator' 102 | id = db.Column(db.String(128), primary_key=True) 103 | operator_name = db.Column(db.String(64)) 104 | father_operator_ids = db.Column(db.String(128)) 105 | child_operator_ids = db.Column(db.String(128)) 106 | model_id = db.Column(db.Integer) 107 | status = db.Column(db.String(32)) 108 | operator_output_url = db.Column(db.String(512)) 109 | operator_input_url = db.Column(db.String(512)) 110 | operator_type_id = db.Column(db.Integer) 111 | operator_config = db.Column(db.String(4096)) 112 | operator_style = db.Column(db.String(4096)) 113 | run_info = db.Column(db.String(8192)) 114 | 115 | 116 | class ModelExecute(db.Model): 117 | """ 118 | 模型执行记录表 119 | """ 120 | __tablename__ = 'model_execute' 121 | id = db.Column(db.Integer, primary_key=True) 122 | model_id = db.Column(db.Integer) 123 | start_nodes = db.Column(db.String(2048)) 124 | status = db.Column(db.String(32)) 125 | execute_user_id = db.Column(db.Integer) 126 | run_info = db.Column(db.String(4096)) 127 | create_time = db.Column(db.String(32)) 128 | end_time = db.Column(db.String(32)) 129 | 130 | 131 | class Report(db.Model): 132 | """ 133 | 报告表 134 | """ 135 | __tablename__ = 'report' 136 | id = db.Column(db.Integer, primary_key=True) 137 | user_id = db.Column(db.String(128)) 138 | report_title = db.Column(db.String(128)) 139 | report_content = db.Column(db.String(20000)) 140 | 141 | 142 | class MLModel(db.Model): 143 | """ 144 | 机器学习算法训练结果模型表 145 | """ 146 | __tablename__ = 'ml_model' 147 | id = db.Column(db.Integer, primary_key=True) 148 | user_id = db.Column(db.Integer) 149 | project_id = db.Column(db.Integer) 150 | model_id = db.Column(db.Integer) 151 | status = db.Column(db.String(32)) 152 | name = db.Column(db.String(32)) 153 | operator_type_id = db.Column(db.Integer) 154 | model_url = db.Column(db.String(256)) 155 | -------------------------------------------------------------------------------- /app/models/ServerNameMap.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | """ 4 | v1版本,弃用 5 | """ 6 | 7 | 8 | class ServerNameMap(): 9 | operateIdToNameMap = { 10 | # 1000 数据预处理 11 | "1001": "过滤", 12 | "1002": "排序", 13 | "1003": "列拆分", 14 | "1004": "行拆分", 15 | "1005": "列合并", 16 | "1006": "替换", 17 | "1007": "填充控制", 18 | "1008": "列变换", 19 | # 2000 特征工程 20 | "2001": "分位数离散化", 21 | "2002": "向量索引转换", 22 | "2003": "标准化", 23 | "2004": "PCA", 24 | "2005": "字符串转标签", 25 | "2006": "独热编码", 26 | "2007": "多项式扩展", 27 | "2008": "卡方选择" 28 | } 29 | operateIdToTypeMap = { 30 | # 1000 数据预处理 31 | "1001": "数据预处理", 32 | "1002": "数据预处理", 33 | "1003": "数据预处理", 34 | "1004": "数据预处理", 35 | "1005": "数据预处理", 36 | "1006": "数据预处理", 37 | "1007": "数据预处理", 38 | "1008": "数据预处理", 39 | # 2000 特征工程 40 | "2001": "特征工程", 41 | "2002": "特征工程", 42 | "2003": "特征工程", 43 | "2004": "特征工程", 44 | "2005": "特征工程", 45 | "2006": "特征工程", 46 | "2007": "特征工程", 47 | "2008": "特征工程" 48 | } 49 | 50 | typeToColorMap = { 51 | "数据预处理": '#5fe0a6', 52 | "特征工程": '#f0bb66' 53 | } 54 | -------------------------------------------------------------------------------- /app/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/models/__init__.py -------------------------------------------------------------------------------- /app/service/ClearTask.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from app.models.MSEntity import Operator, MLModel 3 | from app import db 4 | from app.Utils import deltree, deldir 5 | 6 | """ 7 | 该类的作用:清理无用的中间数据 8 | """ 9 | 10 | import os 11 | 12 | filePath = '/home/zk/midData' 13 | modelPaths = ['/home/zk/midData/model/secondClassification/svm', '/home/zk/midData/model/secondClassification/gbdt'] 14 | 15 | # 数据可用到的中间数据 16 | urls_arr = [] 17 | # 查找operator 18 | query = db.session.query(Operator) 19 | db.session.commit() 20 | for operator in query: 21 | url = operator.operator_output_url 22 | if (url is not None) and (url is not ''): 23 | urls_arr.extend(url.split('*,')) 24 | 25 | # 查找 保存的model 26 | models = db.session.query(MLModel) 27 | db.session.commit() 28 | for model in models: 29 | url = model.model_url 30 | if (url is not None) and (url is not ''): 31 | urls_arr.extend(url.split('*,')) 32 | print(urls_arr) 33 | 34 | # 磁盘上所有中间数据 35 | all_file = [] 36 | for i, j, k in os.walk(filePath): 37 | for item in k: 38 | all_file.append(filePath + '/' + item) 39 | break 40 | 41 | for modelPath in modelPaths: 42 | for i, j, k in os.walk(modelPath): 43 | for item in j: 44 | all_file.append(modelPath + '/' + item) 45 | break 46 | 47 | for url in all_file: 48 | print(url) 49 | 50 | print('******删除一下内容:') 51 | for url in all_file: 52 | if url not in urls_arr: 53 | print("删除:" + url) 54 | if os.path.isdir(url): 55 | deltree(url) 56 | elif os.path.isfile(url): 57 | deldir(url) 58 | -------------------------------------------------------------------------------- /app/service/ExplorationService.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from app.Utils import * 3 | import app.dao.OperatorDao as OperatorDao 4 | import pandas as pd 5 | 6 | 7 | def full_table_statistics(spark_session, operator_id, file_url, condition): 8 | """ 9 | 全表统计 10 | :param spark_session: 11 | :param operator_id: 12 | :param file_url: 13 | :param condition: 14 | :return: 15 | """ 16 | 17 | try: 18 | # 修改计算状态 19 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 20 | # 读取数据 21 | df = read_data_pandas(file_url) 22 | # 全表统计函数 23 | result_df = full_table_statistics_core(df, condition) 24 | if isinstance(result_df, str): 25 | OperatorDao.update_operator_by_id(operator_id, 'error', '', result_df) 26 | else: 27 | # 存储结果 28 | result_file_url = save_data_pandas(result_df, '', '', 1) 29 | run_info = '全表统计算子执行成功' 30 | # 修改计算状态 31 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 32 | return [result_file_url] 33 | 34 | except Exception as e: 35 | run_info = str(e) 36 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 37 | traceback.print_exc() 38 | return [] 39 | 40 | 41 | def full_table_statistics_core(df, condition): 42 | """ 43 | 全表统计核心函数 44 | :param df: 数据(pandas) 45 | :param condition: {"projectId": 32, "columnNames": ['利润']} 46 | :return: 47 | """ 48 | column_names = condition['columnNames'] 49 | statistics = ['类型', '总数', '最小值', '最小值位置', '25%分位数', '中位数', '75%分位数', '均值', '最大值', '最大值位置', '平均绝对偏差', '方差', 50 | '标准差', '偏度', '峰度'] 51 | data = {} 52 | for columnName in column_names: 53 | info = [] 54 | column_type = df[columnName].dtype 55 | if column_type == 'int64' or column_type == 'float64': 56 | info.append('number') 57 | info.append(str(df[columnName].count())) 58 | info.append(str(df[columnName].min())) 59 | info.append(str(df[columnName].idxmin())) 60 | info.append(str(df[columnName].quantile(.25))) 61 | info.append(str(df[columnName].median())) 62 | info.append(str(df[columnName].quantile(.75))) 63 | info.append(str(df[columnName].mean())) 64 | info.append(str(df[columnName].max())) 65 | info.append(str(df[columnName].idxmax())) 66 | info.append(str(df[columnName].mad())) 67 | info.append(str(df[columnName].var())) 68 | info.append(str(df[columnName].std())) 69 | info.append(str(df[columnName].skew())) 70 | info.append(str(df[columnName].kurt())) 71 | else: 72 | info.append('text') 73 | info.append(str(df[columnName].count())) 74 | info.append('') 75 | info.append('') 76 | info.append('') 77 | info.append('') 78 | info.append('') 79 | info.append('') 80 | info.append('') 81 | info.append('') 82 | info.append('') 83 | info.append('') 84 | info.append('') 85 | info.append('') 86 | info.append('') 87 | data[columnName] = info 88 | print(pd.DataFrame(data, index=statistics)) 89 | return pd.DataFrame(data, index=statistics) 90 | 91 | 92 | def frequency_statistics(spark_session, operator_id, file_url, condition): 93 | """ 94 | 频次统计 95 | :param spark_session: 96 | :param operator_id: 97 | :param file_url: 98 | :param condition: 99 | :return: 100 | """ 101 | try: 102 | # 修改计算状态 103 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 104 | # 读取数据 105 | df = read_data_pandas(file_url) 106 | # 频次统计函数 107 | result_df = frequency_statistics_core(df, condition) 108 | if isinstance(result_df, str): 109 | OperatorDao.update_operator_by_id(operator_id, 'error', '', result_df) 110 | else: 111 | # 存储结果 112 | result_file_url = save_data_pandas(result_df) 113 | run_info = '频次统计算子执行成功' 114 | # 修改计算状态 115 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 116 | return [result_file_url] 117 | 118 | except Exception as e: 119 | run_info = str(e) 120 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 121 | traceback.print_exc() 122 | return [] 123 | 124 | 125 | def frequency_statistics_core(df, condition): 126 | """ 127 | :param df: 128 | :param condition:{"projectId":32,"columnName":"Item"} 129 | :return: 130 | """ 131 | column_name = condition['columnName'] 132 | 133 | s = df[column_name].value_counts() 134 | data = {column_name: s.index, "频率": s.values} 135 | print(pd.DataFrame(data)) 136 | return pd.DataFrame(data) 137 | 138 | 139 | def correlation_coefficient(spark_session, operator_id, file_url, condition): 140 | """ 141 | 相关系数 142 | :param spark_session: 143 | :param operator_id: 144 | :param file_url: 145 | :param condition: 146 | :return: 147 | """ 148 | try: 149 | # 修改计算状态 150 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 151 | # 读取数据 152 | df = read_data_pandas(file_url) 153 | # 相关系数函数 154 | result_df = correlation_coefficient_core(df, condition) 155 | if isinstance(result_df, str): 156 | OperatorDao.update_operator_by_id(operator_id, 'error', '', result_df) 157 | else: 158 | # 存储结果 159 | result_file_url = save_data_pandas(result_df, '', '', 1) 160 | run_info = '相关系数算子执行成功' 161 | # 修改计算状态 162 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 163 | return [result_file_url] 164 | 165 | except Exception as e: 166 | run_info = str(e) 167 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 168 | traceback.print_exc() 169 | return [] 170 | 171 | 172 | def correlation_coefficient_core(df, condition): 173 | """ 174 | :param df: 175 | :param condition: {"projectId": 32, "columnNames": ["销售额", "折扣", "装运成本"]} 176 | :return: 177 | """ 178 | column_names = condition['columnNames'] 179 | # 报错信息:如果所选列不是数值型,则报错 180 | accept_types = ['int64', 'float64'] 181 | for columnName in column_names: 182 | if df[columnName].dtype not in accept_types: 183 | return "只能计算数值型列的相关系数,但是 <" + columnName + "> 的类型为 " + str(df[columnName].dtype) 184 | # 计算出相关系数矩阵df 185 | return df.corr() 186 | -------------------------------------------------------------------------------- /app/service/MLModelService.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import app.dao.ModelDao as ModelDao 4 | import app.dao.MLModelDao as MLModelDao 5 | import app.dao.OperatorDao as OperatorDao 6 | import app.dao.OperatorTypeDao as OperatorTypeDao 7 | import app.dao.ModelExecuteDao as ModelExecuteDao 8 | from app.models.MSEntity import Operator, ModelExecute, MLModel 9 | import app.service.ModelExecuteService as ModelExecuteService 10 | from app.Utils import * 11 | 12 | """ 13 | 关于ml_model 的处理方法 14 | """ 15 | 16 | 17 | def save_ml_model(operator_id, user_id, name): 18 | """ 19 | 保存训练模型 20 | :param operator_id: 21 | :param user_id: 22 | :param name: 23 | :return: 24 | """ 25 | # 查看算子 26 | operator = OperatorDao.get_operator_by_id(operator_id) 27 | if operator.operator_type_id > 7000 or operator.operator_type_id < 6001: 28 | return "所选择的节点并不是模型算子节点" 29 | if operator.status != "success": 30 | return "请执行该节点" 31 | if operator.operator_output_url is not None: 32 | operator_output_url = operator.operator_output_url.split('*,') 33 | else: 34 | return "没有运行结果" 35 | 36 | model_url = operator_output_url[0] 37 | operator_type_id = operator.operator_type_id 38 | model_id = operator.model_id 39 | 40 | # 查看执行流程model 41 | model = ModelDao.get_model_by_id(model_id) 42 | project_id = model.project_id 43 | 44 | ml_model = MLModel(user_id=user_id, project_id=project_id, model_id=model_id, status='save', name=name, 45 | operator_type_id=operator_type_id, model_url=model_url) 46 | return MLModelDao.create_ml_model(ml_model) 47 | 48 | 49 | def get_ml_model(ml_model_id, project_id, user_id, model_id, name, status): 50 | """ 51 | 按照条件查询ml_model 52 | :param ml_model_id: 53 | :param project_id: 54 | :param user_id: 55 | :param model_id: 56 | :param name: 57 | :param status: 58 | :return: 59 | """ 60 | ml_models = MLModel.query 61 | if (ml_model_id is not None) and (ml_model_id is not ''): 62 | ml_models = ml_models.filter(MLModel.id == ml_model_id) 63 | if (project_id is not None) and (project_id is not ''): 64 | ml_models = ml_models.filter(MLModel.project_id == project_id) 65 | if (user_id is not None) and (user_id is not ''): 66 | ml_models = ml_models.filter(MLModel.user_id == user_id) 67 | if (model_id is not None) and (model_id is not ''): 68 | ml_models = ml_models.filter(MLModel.model_id == model_id) 69 | if (name is not None) and (name is not ''): 70 | ml_models = ml_models.filter(MLModel.name == name) 71 | if (status is not None) and (status is not ''): 72 | ml_models = ml_models.filter(MLModel.status == status) 73 | results = [] 74 | for ml_model in ml_models: 75 | results.append({"MLModelId": ml_model.id, "status": ml_model.status, "name": ml_model.name, 76 | "operatorTypeId": ml_model.operator_type_id}) 77 | return results 78 | 79 | 80 | def delete_ml_model(ml_model_id): 81 | """ 82 | 删除 ml_model 83 | :param ml_model_id: 84 | :return: 85 | """ 86 | filters = {MLModel.id: ml_model_id} 87 | MLModel.query.filter(*filters).delete() 88 | db.session.commit() 89 | -------------------------------------------------------------------------------- /app/service/ModelExecuteService.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import queue 4 | import threading 5 | import traceback 6 | import app.service.FEService as FEService 7 | import app.service.ml.Evaluation as Evaluation 8 | import app.service.ml.ModelService as ModelService 9 | import app.service.PreprocessService as preprocessService 10 | import app.service.ExplorationService as ExplorationService 11 | import app.service.ml.PredictService as PredictService 12 | import app.service.ml.SecondClassification as SecondClassification 13 | import app.service.ml.MultipleClassifition as MultipleClassifition 14 | import app.dao.OperatorDao as OperatorDao 15 | 16 | 17 | def model_thread_execute(spark_session, start_nodes): 18 | """ 19 | 多线程执行 model(执行流程) 20 | :param spark_session: 21 | :param start_nodes:['1','2'] model(执行流程启动的节点) 22 | :return: 23 | """ 24 | 25 | class MyThread(threading.Thread): 26 | def __init__(self, threadID, name, q): 27 | threading.Thread.__init__(self) 28 | self.threadID = threadID 29 | self.name = name 30 | self.q = q 31 | 32 | def run(self): 33 | print("开启线程:" + self.name) 34 | process_data(self.name, self.q) 35 | print("退出线程:" + self.name) 36 | 37 | def process_data(threadName, q): 38 | print('-------进入线程:', threadName) 39 | 40 | while G.noExecFlag or G.execCounter or not workQueue.empty(): 41 | print('-------进入线程内部循环:', not workQueue.empty()) 42 | # TODO 多线程安全 43 | G.noExecFlag = 0 44 | G.execCounter += 1 45 | queueLock.acquire() 46 | if not workQueue.empty(): 47 | operator_id = q.get() 48 | queueLock.release() 49 | # TODO:处理函数 50 | could_execute_operator_ids = operator_execute(spark_session, operator_id) 51 | if could_execute_operator_ids is False: 52 | print("%s processing %s error" % (threadName, operator_id)) 53 | else: 54 | for item_id in could_execute_operator_ids: 55 | if item_id != '' and item_id is not None: 56 | q.put(item_id) 57 | print("%s processing %s add %s to queue" % ( 58 | threadName, operator_id, ','.join(could_execute_operator_ids))) 59 | print("q.size: %s --- workQueue.size: %s" % (q.qsize(), workQueue.qsize())) 60 | else: 61 | queueLock.release() 62 | G.execCounter -= 1 63 | time.sleep(1) 64 | 65 | class G: 66 | # 未开始执行 67 | noExecFlag = 1 68 | # 正在执行operator的个数 69 | execCounter = 0 70 | 71 | threadList = ["Thread-1", "Thread-2", "Thread-3"] 72 | queueLock = threading.Lock() 73 | workQueue = queue.Queue(10) 74 | threads = [] 75 | threadID = 1 76 | 77 | # 填充队列 78 | queueLock.acquire() 79 | for word in start_nodes: 80 | workQueue.put(word) 81 | queueLock.release() 82 | 83 | # 创建新线程 84 | for tName in threadList: 85 | thread = MyThread(threadID, tName, workQueue) 86 | thread.start() 87 | threads.append(thread) 88 | threadID += 1 89 | 90 | # 等待队列清空 91 | while (not workQueue.empty()) or G.execCounter: 92 | pass 93 | 94 | # 等待所有线程完成 95 | for t in threads: 96 | t.join() 97 | print("退出主线程") 98 | 99 | 100 | def operator_execute(spark_session, operator_id): 101 | """ 102 | 执行算子 103 | :param spark_session: 104 | :param operator_id: 105 | :return: 106 | """ 107 | try: 108 | # 查算子 109 | operator = OperatorDao.get_operator_by_id(operator_id) 110 | print("------执行算子------", "operator_id:", operator_id, operator.operator_type_id) 111 | # 获取input_url 112 | config = json.loads(operator.operator_config) 113 | file_url_list = config['fileUrl'] 114 | # 获取输入地址 115 | url_arr = [] 116 | for file_url_dict in file_url_list: 117 | key = '' 118 | for ikey in file_url_dict.keys(): 119 | key = ikey 120 | if operator_id == key: 121 | url_arr.append(file_url_dict[key]) 122 | else: 123 | father = OperatorDao.get_operator_by_id(key) 124 | # 检查父节点是否准备就绪 125 | if father.status != 'success': 126 | return [] 127 | # TODO:暂定从0 开始 128 | father_output_url_index = file_url_dict[key] 129 | father_url_arr = father.operator_output_url.split('*,') 130 | url_arr.append(father_url_arr[father_output_url_index]) 131 | # 算子函数 132 | if operator.operator_type_id == 1001: 133 | preprocessService.filter_multi_conditions(spark_session, operator_id, url_arr[0], 134 | json.loads(operator.operator_config)['parameter']) 135 | elif operator.operator_type_id == 1002: 136 | preprocessService.sort(spark_session, operator_id, url_arr[0], 137 | json.loads(operator.operator_config)['parameter']) 138 | elif operator.operator_type_id == 1003: 139 | preprocessService.column_split(spark_session, operator_id, url_arr[0], 140 | json.loads(operator.operator_config)['parameter']) 141 | elif operator.operator_type_id == 1005: 142 | preprocessService.columns_merge(spark_session, operator_id, url_arr[0], 143 | json.loads(operator.operator_config)['parameter']) 144 | elif operator.operator_type_id == 1006: 145 | preprocessService.replace(spark_session, operator_id, url_arr[0], 146 | json.loads(operator.operator_config)['parameter']) 147 | elif operator.operator_type_id == 1007: 148 | preprocessService.fill_null_value(spark_session, operator_id, url_arr[0], 149 | json.loads(operator.operator_config)['parameter']) 150 | elif operator.operator_type_id == 1008: 151 | preprocessService.column_map(spark_session, operator_id, url_arr[0], 152 | json.loads(operator.operator_config)['parameter']) 153 | elif operator.operator_type_id == 1009: 154 | preprocessService.random_split(spark_session, operator_id, url_arr[0], 155 | json.loads(operator.operator_config)['parameter']) 156 | elif operator.operator_type_id == 2001: 157 | FEService.quantile_discretization(spark_session, operator_id, url_arr[0], 158 | json.loads(operator.operator_config)['parameter']) 159 | elif operator.operator_type_id == 2002: 160 | FEService.vector_indexer(spark_session, operator_id, url_arr[0], 161 | json.loads(operator.operator_config)['parameter']) 162 | elif operator.operator_type_id == 2003: 163 | FEService.standard_scaler(spark_session, operator_id, url_arr[0], 164 | json.loads(operator.operator_config)['parameter']) 165 | elif operator.operator_type_id == 2004: 166 | FEService.pca(spark_session, operator_id, url_arr[0], 167 | json.loads(operator.operator_config)['parameter']) 168 | elif operator.operator_type_id == 2005: 169 | FEService.string_indexer(spark_session, operator_id, url_arr[0], 170 | json.loads(operator.operator_config)['parameter']) 171 | elif operator.operator_type_id == 2006: 172 | FEService.one_hot_encoder(spark_session, operator_id, url_arr[0], 173 | json.loads(operator.operator_config)['parameter']) 174 | elif operator.operator_type_id == 2007: 175 | FEService.polynomial_expansion(spark_session, operator_id, url_arr[0], 176 | json.loads(operator.operator_config)['parameter']) 177 | elif operator.operator_type_id == 2008: 178 | FEService.chiSqSelector(spark_session, operator_id, url_arr[0], 179 | json.loads(operator.operator_config)['parameter']) 180 | elif operator.operator_type_id == 3001: 181 | ExplorationService.full_table_statistics(spark_session, operator_id, url_arr[0], 182 | json.loads(operator.operator_config)['parameter']) 183 | elif operator.operator_type_id == 3002: 184 | ExplorationService.frequency_statistics(spark_session, operator_id, url_arr[0], 185 | json.loads(operator.operator_config)['parameter']) 186 | elif operator.operator_type_id == 3003: 187 | ExplorationService.correlation_coefficient(spark_session, operator_id, url_arr[0], 188 | json.loads(operator.operator_config)['parameter']) 189 | elif operator.operator_type_id == 5001: 190 | preprocessService.read_data_with_update_record(spark_session, operator_id, url_arr[0]) 191 | elif operator.operator_type_id == 6000: 192 | PredictService.ml_predict(spark_session, operator_id, url_arr, 193 | json.loads(operator.operator_config)['parameter']) 194 | elif operator.operator_type_id == 6001: 195 | SecondClassification.svm(spark_session, operator_id, url_arr[0], 196 | json.loads(operator.operator_config)['parameter']) 197 | elif operator.operator_type_id == 6002: 198 | SecondClassification.gbdt(spark_session, operator_id, url_arr[0], 199 | json.loads(operator.operator_config)['parameter']) 200 | elif operator.operator_type_id == 6003: 201 | SecondClassification.lr(spark_session, operator_id, url_arr[0], 202 | json.loads(operator.operator_config)['parameter']) 203 | elif operator.operator_type_id == 6004: 204 | MultipleClassifition.lr(spark_session, operator_id, url_arr[0], 205 | json.loads(operator.operator_config)['parameter']) 206 | elif operator.operator_type_id == 6005: 207 | MultipleClassifition.mpc(spark_session, operator_id, url_arr[0], 208 | json.loads(operator.operator_config)['parameter']) 209 | elif operator.operator_type_id == 7001: 210 | Evaluation.second_evaluation(spark_session, operator_id, 211 | json.loads(operator.operator_config)['parameter']) 212 | elif operator.operator_type_id == 8000: 213 | ModelService.model_operator(operator_id, json.loads(operator.operator_config)['parameter']) 214 | 215 | return operator.child_operator_ids.split(',') 216 | 217 | except: 218 | traceback.print_exc() 219 | return False 220 | -------------------------------------------------------------------------------- /app/service/ModelService.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import app.dao.ModelDao as ModelDao 4 | import app.dao.OperatorDao as OperatorDao 5 | import app.dao.OperatorTypeDao as OperatorTypeDao 6 | import app.dao.ModelExecuteDao as ModelExecuteDao 7 | from app.models.MSEntity import Operator, ModelExecute 8 | import app.service.ModelExecuteService as ModelExecuteService 9 | from app.Utils import * 10 | 11 | """ 12 | 关于model(执行流程)的处理方法 13 | """ 14 | 15 | 16 | def update_model(project_id, start_nodes, config, relationship, config_order): 17 | """ 18 | 更新 model (处理流程图) 19 | ToDo: 没有考虑数据库操作的原子性 20 | :param project_id: 21 | :param start_nodes: 22 | :param config: 23 | :return: 24 | """ 25 | try: 26 | # 获取model 27 | model = ModelDao.get_model_by_project_id(project_id) 28 | if model is False: 29 | return False 30 | 31 | # 更新model 32 | update_result = ModelDao.update_with_project_id(project_id, start_nodes, relationship, config_order) 33 | if update_result is False: 34 | return False 35 | 36 | # 获取 operator 37 | operator_old = OperatorDao.get_operator_by_model_id(model.id) 38 | 39 | # 新的operator 40 | operators = [] 41 | config_dict = json.loads(config) 42 | for operator_id in config_dict.keys(): 43 | operator_dict = config_dict.get(operator_id) 44 | operator_style = json.dumps({'location': operator_dict['location'], }, ensure_ascii=False) 45 | # json.dumps(operates, ensure_ascii=False) 46 | ope = Operator(id=operator_id, 47 | father_operator_ids=','.join(operator_dict['pre']), 48 | child_operator_ids=','.join(operator_dict['next']), 49 | model_id=model.id, 50 | status='initial', 51 | operator_type_id=operator_dict['name'], 52 | operator_config=json.dumps(operator_dict['config'], ensure_ascii=False), 53 | operator_style=operator_style) 54 | operators.append(ope) 55 | 56 | # 准备删除的算子 57 | operator_delete = [] 58 | # 准备更新的算子 59 | operator_update = [] 60 | for old in operator_old: 61 | flag_exist = False 62 | for new in operators: 63 | if old.id == new.id: 64 | flag_exist = True 65 | operator_update.append([old, new]) 66 | break 67 | if not flag_exist: 68 | operator_delete.append(old) 69 | # 删除算子 70 | for delete in operator_delete: 71 | OperatorDao.delete_operator_by_id(delete.id) 72 | print("********删除算子", delete) 73 | 74 | # 更新算子 75 | for update in operator_update: 76 | update[0].father_operator_ids = update[1].father_operator_ids 77 | update[0].child_operator_ids = update[1].child_operator_ids 78 | update[0].model_id = update[1].model_id 79 | update[0].operator_type_id = update[1].operator_type_id 80 | update[0].operator_config = update[1].operator_config 81 | update[0].operator_style = update[1].operator_style 82 | # 更新算子 83 | OperatorDao.create_operator([update[0]]) 84 | print("*********更新算子", update[0]) 85 | 86 | # 准备添加的算子 87 | operator_add = [] 88 | for new in operators: 89 | flag_exist = False 90 | for old in operator_old: 91 | if old.id == new.id: 92 | flag_exist = True 93 | break 94 | if not flag_exist: 95 | operator_add.append(new) 96 | # 添加算子 97 | OperatorDao.create_operator(operator_add) 98 | print("*********添加算子", operator_add) 99 | return True 100 | except: 101 | traceback.print_exc() 102 | return False 103 | 104 | 105 | def get_model_by_project_id(project_id): 106 | """ 107 | 获取项目对应的model(执行流程) 108 | :param project_id: 109 | :return: 110 | """ 111 | # 获取 model 112 | model = ModelDao.get_model_by_project_id(project_id) 113 | if model is False: 114 | return False 115 | 116 | # 获取 operator 117 | operators = OperatorDao.get_operator_by_model_id(model.id) 118 | if operators is False: 119 | return False 120 | 121 | # 获取 operator_type 122 | operator_types = OperatorTypeDao.get_all_operator_type() 123 | if operator_types is False: 124 | return False 125 | 126 | # TODO : 查询数据源表 127 | 128 | operator_types_dict = dict() 129 | for operator_type in operator_types: 130 | operator_types_dict[operator_type.id] = operator_type 131 | 132 | # 返回结果 133 | config = dict() 134 | for operator in operators: 135 | if operator_types_dict[operator.operator_type_id].id == 5001: 136 | data_operator_type = json.loads(operator.operator_config)['fileUrl'][0][operator.id].split('/')[-1] 137 | else: 138 | data_operator_type = operator_types_dict[operator.operator_type_id].type_name 139 | config[operator.id] = {'type': data_operator_type, 140 | 'name': operator_types_dict[operator.operator_type_id].id, 141 | 'location': json.loads(operator.operator_style)['location'], 142 | 'config': json.loads(operator.operator_config), 143 | 'next': operator.child_operator_ids.split(','), 144 | "pre": operator.father_operator_ids.split(',')} 145 | model_config = json.loads(model.config) 146 | relationship = [] 147 | for item in model_config['relationship'].split('*,'): 148 | relationship.append(list_str_to_list(item)) 149 | config_order = json.loads(model_config['config_order']) 150 | return {'projectId': project_id, 'config': config, 'startNode': model.start_nodes.split(','), 151 | 'relationship': relationship, 'config_order': config_order} 152 | 153 | 154 | def get_status_model_execute_end(project_id, start_operator_ids): 155 | """ 156 | 获取运行结束后的状态 157 | 158 | :param project_id: 159 | :param start_operator_ids: 160 | :return: 161 | """ 162 | # 获取 model 163 | model = ModelDao.get_model_by_project_id(project_id) 164 | if model is False: 165 | return False 166 | 167 | # 获取 operator 168 | operators = OperatorDao.get_operator_by_model_id(model.id) 169 | if operators is False: 170 | return False 171 | 172 | # 构造dict 173 | id_operator_dict = {} 174 | for operator in operators: 175 | id_operator_dict[operator.id] = operator 176 | 177 | operator_from_one_ids = [] 178 | operator_from_one_ids.extend(start_operator_ids) 179 | 180 | # 从此次执行 起始节点及以后节点的状态 181 | status_set = set() 182 | while operator_from_one_ids: 183 | item = operator_from_one_ids.pop(0) 184 | if not (item is None or item == ''): 185 | status_set.add(id_operator_dict[item].status) 186 | operator_from_one_ids.extend(id_operator_dict[item].child_operator_ids.split(',')) 187 | 188 | if len(status_set) == 1 and "success" in status_set: 189 | return "success" 190 | else: 191 | return "error" 192 | 193 | 194 | def get_run_status_by_project_id(project_id, model_execute_id): 195 | """ 196 | 获取某次执行的状态和其中的每个算子的状态 197 | 198 | :param project_id: 199 | :param model_execute_id: model的执行记录ID 200 | :return: 201 | """ 202 | 203 | # 获取 model 204 | model = ModelDao.get_model_by_project_id(project_id) 205 | if model is False: 206 | return False 207 | 208 | # 获取 operator 209 | operators = OperatorDao.get_operator_by_model_id(model.id) 210 | if operators is False: 211 | return False 212 | 213 | # 构造dict 214 | id_operator_dict = {} 215 | for operator in operators: 216 | id_operator_dict[operator.id] = operator 217 | 218 | # 查看此次执行记录(状态、起始节点) 219 | model_execute_ = ModelExecuteDao.get_model_execute_by_id(model_execute_id) 220 | operator_from_one_ids = model_execute_.start_nodes.split(',') 221 | 222 | # 查看此次执行的所有节点的状态 223 | result = dict() 224 | while operator_from_one_ids: 225 | item = operator_from_one_ids.pop(0) 226 | if not (item is None or item == ''): 227 | result[id_operator_dict[item].id] = {"status": id_operator_dict[item].status, 228 | "log": id_operator_dict[item].run_info} 229 | operator_from_one_ids.extend(id_operator_dict[item].child_operator_ids.split(',')) 230 | 231 | return {"modelExecuteStatus": model_execute_.status, "operatorStatus": result} 232 | 233 | 234 | def run_execute_status_from_start(user_id, project_id): 235 | """ 236 | 设置模型运行时状态(从头开始执行) 237 | :param user_id: 238 | :param project_id: 239 | :return: 240 | """ 241 | # 获取 model 242 | model = ModelDao.get_model_by_project_id(project_id) 243 | if model is False: 244 | return False 245 | # 状态初始化 246 | start_nodes = model.start_nodes.split(',') 247 | model_execute_id = initial_execute_status(user_id, start_nodes) 248 | ModelExecuteDao.update_model_execute(model_execute_id, "running", "", "") 249 | return {'model_execute_id': model_execute_id, 'start_nodes': start_nodes} 250 | 251 | 252 | def run_execute_status_from_one(user_id, operator_id): 253 | """ 254 | 设置模型运行时状态(从某个节点开始执行) 255 | :param user_id: 256 | :param operator_id: 257 | :return: 258 | """ 259 | # 状态初始化 260 | start_nodes = [operator_id] 261 | model_execute_id = initial_execute_status(user_id, start_nodes) 262 | ModelExecuteDao.update_model_execute(model_execute_id, "running", "", "") 263 | return {'model_execute_id': model_execute_id, 'start_nodes': start_nodes} 264 | 265 | 266 | def model_execute(user_id, project_id, param): 267 | """ 268 | 执行模型 269 | :param user_id: 1 270 | :param project_id: 32 271 | :param param: {'model_execute_id': model_execute_id, 'start_nodes': start_nodes} 272 | :return: 273 | """ 274 | model_execute_id = param['model_execute_id'] 275 | start_nodes = param['start_nodes'] 276 | # spark会话 277 | spark_session = getSparkSession(user_id, "executeModel") 278 | # 多线程执行 279 | print("-----model_execute_from_start------", "start_nodes", ','.join(start_nodes)) 280 | ModelExecuteService.model_thread_execute(spark_session, start_nodes) 281 | # 执行完毕,更改执行状态 282 | end_status = get_status_model_execute_end(project_id, start_nodes) 283 | ModelExecuteDao.update_model_execute(model_execute_id, end_status, "", 284 | time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) 285 | return model_execute_id 286 | 287 | 288 | def initial_execute_status(execute_user_id, start_nodes): 289 | """ 290 | 每次执行model时,初始化执行状态 291 | :param execute_user_id: 292 | :param start_nodes: [] 293 | :return: 294 | """ 295 | # 查找参与运行的 operator 296 | operator_list = [] 297 | operator_id_queue = [] 298 | for x in start_nodes: 299 | operator_id_queue.append(x) 300 | while len(operator_id_queue) > 0: 301 | operator_id = operator_id_queue.pop(0) 302 | if operator_id is None or operator_id == "": 303 | continue 304 | operator = OperatorDao.get_operator_by_id(operator_id) 305 | operator_list.append(operator) 306 | for x in operator.child_operator_ids.split(','): 307 | operator_id_queue.append(x) 308 | 309 | # 每个operator 状态初始化为initial 310 | for operator in operator_list: 311 | OperatorDao.update_operator_by_id(operator.id, "initial") 312 | 313 | # 追加执行记录 314 | model_execute = ModelExecute(start_nodes=','.join(start_nodes), status='initial', execute_user_id=execute_user_id, 315 | create_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) 316 | model_execute = ModelExecuteDao.create_model_execute(model_execute) 317 | if model_execute is False: 318 | return False 319 | else: 320 | return model_execute.id 321 | -------------------------------------------------------------------------------- /app/service/PreprocessService.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from app.Utils import * 3 | import random 4 | import string 5 | from app.ConstFile import const 6 | import app.dao.OperatorDao as OperatorDao 7 | from pyspark.sql import functions as func 8 | from pyspark.sql.functions import split, explode, concat_ws, regexp_replace 9 | 10 | save_dir = const.SAVEDIR 11 | 12 | 13 | def read_data_with_update_record(spark_session, operator_id, file_url): 14 | """ 15 | 读数据算子,拷贝数据并更新算子记录表 16 | 17 | :param spark_session: 18 | :param operator_id: 19 | :param file_url: 20 | :return: 21 | """ 22 | try: 23 | # 修改计算状态 24 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 25 | # 读取数据 26 | df = read_data(spark_session, file_url) 27 | # 存储结果 28 | result_file_url = save_data(df) 29 | 30 | run_info = 'read_data算子执行成功' 31 | # 修改计算状态 32 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 33 | return [result_file_url] 34 | except Exception as e: 35 | run_info = str(e) 36 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 37 | traceback.print_exc() 38 | return [] 39 | 40 | 41 | def filter_multi_conditions(spark_session, operator_id, file_url, condition): 42 | """ 43 | 按照多个条件进行过滤 44 | 45 | :param spark_session: 46 | :param operator_id: 47 | :param file_url: 48 | :param condition: {"userId":1,"projectId":32,"parameter":[{"colName":"利润", "operate":">","value":"100", "relation":"AND"},{"colName":"装运方式", "operate":"==", "value":"一级", "relation":""}]} 49 | :return: 50 | """ 51 | 52 | try: 53 | 54 | # 修改计算状态 55 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 56 | # 读取数据 57 | df = read_data(spark_session, file_url) 58 | # 过滤函数 59 | result_df = filter_core(spark_session, df, condition['parameter']) 60 | # 存储结果 61 | result_file_url = save_data(result_df) 62 | 63 | run_info = '过滤算子执行成功' 64 | # 修改计算状态 65 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 66 | return [result_file_url] 67 | except Exception as e: 68 | run_info = str(e) 69 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 70 | traceback.print_exc() 71 | return [] 72 | 73 | 74 | def filter_core(spark, df, condition): 75 | """ 76 | 过滤的核心函数 77 | :param spark: 78 | :param df: 79 | :param condition: 80 | :return: 81 | """ 82 | 83 | table_name = ''.join(random.sample(string.ascii_letters + string.digits, 8)) 84 | sql_str = 'select * from ' + table_name + ' where' 85 | for i in condition: 86 | if is_number(i['value']): 87 | sql_str = sql_str + ' `' + i['colName'] + '` ' + i['operate'] + ' ' + i['value'] + ' ' + i['relation'] 88 | else: 89 | sql_str = sql_str + ' `' + i['colName'] + '` ' + i['operate'] + ' \"' + i['value'] + '\" ' + i['relation'] 90 | print(sql_str) 91 | df.createOrReplaceTempView(table_name) 92 | sql_df = spark.sql(sql_str) 93 | 94 | return sql_df 95 | 96 | 97 | def sort(spark_session, operator_id, file_url, condition): 98 | """ 99 | 排序 100 | 101 | :param spark_session: 102 | :param operator_id: 103 | :param file_url: 104 | :param condition: {"userId":1,"projectId":32,"columnName":"利润","sortType":"降序"} 105 | :return: 106 | """ 107 | 108 | try: 109 | # 修改计算状态 110 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 111 | # 读取数据 112 | df = read_data(spark_session, file_url) 113 | # 过滤函数 114 | result_df = sort_core(df, condition['columnName'], condition['sortType']) 115 | # 存储结果 116 | result_file_url = save_data(result_df) 117 | # TODO :判断返回结果是否是String(异常信息) 118 | run_info = '排序算子执行成功' 119 | # 修改计算状态 120 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 121 | return [result_file_url] 122 | 123 | except Exception as e: 124 | run_info = str(e) 125 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 126 | traceback.print_exc() 127 | return [] 128 | 129 | 130 | def sort_core(df, column_name, column_type): 131 | """ 132 | 排序主函数,函数功能包括解析参数、排序;返回df(spark格式) 133 | :param df: 134 | :param column_name: 135 | :param column_type: 136 | :return: 137 | """ 138 | # 只能输入一列,否则报错 139 | if len(column_name.split(",")) != 1: 140 | return "ERROR_NOT_ONLY_ONE_COL" 141 | 142 | # sortType默认为升序,若用户指定,以用户指定为准 143 | if (column_type == "") or (column_type is None): 144 | column_type = "升序" 145 | 146 | # 排序 147 | if column_type == "降序": 148 | df = df.sort(column_name, ascending=False) 149 | else: 150 | df = df.sort(column_name) 151 | return df 152 | 153 | 154 | def column_split(spark_session, operator_id, file_url, condition): 155 | """ 156 | 按列拆分 157 | :param spark_session: 158 | :param operator_id: 159 | :param file_url: 160 | :param condition: {"userId": 1, "projectId": 32, "columnName": "订购日期", "delimiter": "/", "newColumnNames": ["year", "月"]} 161 | :return: 162 | """ 163 | try: 164 | # 修改计算状态 165 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 166 | # 读取数据 167 | df = read_data(spark_session, file_url) 168 | # 拆分函数 169 | result_df = column_split_core(spark_session, df, condition) 170 | # 存储结果 171 | result_file_url = save_data(result_df) 172 | # 修改计算状态 173 | run_info = '拆分算子执行成功' 174 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 175 | return [result_file_url] 176 | 177 | except Exception as e: 178 | run_info = str(e) 179 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 180 | traceback.print_exc() 181 | return [] 182 | 183 | 184 | def column_split_core(ss, df, condition_dict): 185 | """ 186 | 按列拆分主函数,返回dataFrame(spark格式) 187 | :param ss: 188 | :param df: 189 | :param condition_dict: 190 | :return: 191 | """ 192 | # 参数解析 193 | column_name = condition_dict['columnName'] 194 | delimiter = condition_dict['delimiter'] 195 | new_column_names = condition_dict['newColumnNames'] 196 | # 获取拆分出的新列的列名,若未指定,暂时存储为空列表,后续根据拆分数填充成为[x_split_1, x_split_2, ...] 197 | if new_column_names is None: 198 | new_column_names = [] 199 | # 将指定列columnName按splitSymbol拆分,存入"splitColumn"列,列内数据格式为[a, b, c, ...] 200 | first_row = df.first() 201 | df_split = df.withColumn("splitColumn", split(df[column_name], delimiter)) 202 | split_number = len(first_row[column_name].split(delimiter)) 203 | # 若用户为指定拆分出的新列的列名,根据拆分数填充 204 | if len(new_column_names) == 0: 205 | for i in range(split_number): 206 | new_column_names.append(column_name + '_split_' + str(i + 1)) 207 | # 给新列名生成索引,格式为:[('年', 0), ('月', 1), ('日', 2)],方便后续操作 208 | sc = ss.sparkContext 209 | newColumnNames_with_index = sc.parallelize(new_column_names).zipWithIndex().collect() 210 | # 遍历生成新列 211 | for name, index in newColumnNames_with_index: 212 | df_split = df_split.withColumn(name, df_split["splitColumn"].getItem(index)) 213 | df = df_split.drop("splitColumn") 214 | return df 215 | 216 | 217 | def columns_merge(spark_session, operator_id, file_url, condition): 218 | """ 219 | 多列合并 220 | :param spark_session: 221 | :param operator_id: 222 | :param file_url: 223 | :param condition: {"userId": 1, "projectId": 32, "columnNames": ["类别", "子类别", "产品名称"], "connector": "-", "newColumnName": "品类名称"} 224 | :return: 225 | """ 226 | try: 227 | # 修改计算状态 228 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 229 | # 读取数据 230 | df = read_data(spark_session, file_url) 231 | # 合并函数 232 | result_df = columns_merge_core(df, condition) 233 | # 存储结果 234 | result_file_url = save_data(result_df) 235 | # 修改计算状态 236 | run_info = '多列合并算子执行成功' 237 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 238 | return [result_file_url] 239 | 240 | except Exception as e: 241 | run_info = str(e) 242 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 243 | traceback.print_exc() 244 | return [] 245 | 246 | 247 | def columns_merge_core(df, condition_dict): 248 | """ 249 | 多列合并主函数,新增一列,列内的值为指定多列合并而成;返回df(spark格式) 250 | :param df: 251 | :param condition_dict: 252 | :return: 253 | """ 254 | # 解析参数 255 | column_names = condition_dict['columnNames'] 256 | split_symbol = condition_dict['connector'] 257 | new_column_name = condition_dict['newColumnName'] 258 | # 默认分隔符是",",或以用户指定为准 259 | if split_symbol is None or split_symbol == '': 260 | split_symbol = ',' 261 | if new_column_name is None or new_column_name == '': 262 | new_column_name = '_'.join(new_column_name) 263 | # 合并 264 | column_list = [] 265 | for i in range(len(column_names)): 266 | column_list.append(df[column_names[i]]) 267 | df = df.withColumn(new_column_name, concat_ws(split_symbol, *column_list)) 268 | return df 269 | 270 | 271 | def replace(spark_session, operator_id, file_url, condition): 272 | """ 273 | 数据替换 274 | :param spark_session: 275 | :param operator_id: 276 | :param file_url: 277 | :param condition: {"userId": 1, "projectId": 32, "columnNames": ["类别", "子类别", "客户名称"],"replaceCharacters":[{"source":"技术","target":"技术copy"},{"source":"电话","target":"电话copy"}]} 278 | :return: 279 | """ 280 | try: 281 | # 修改计算状态 282 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 283 | # 读取数据 284 | df = read_data(spark_session, file_url) 285 | # 替换函数 286 | result_df = replace_core(df, condition) 287 | # 存储结果 288 | result_file_url = save_data(result_df) 289 | # 修改计算状态 290 | run_info = '数据替换算子执行成功' 291 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 292 | return [result_file_url] 293 | 294 | except Exception as e: 295 | run_info = str(e) 296 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 297 | traceback.print_exc() 298 | return [] 299 | 300 | 301 | def replace_core(df, condition_dict): 302 | """ 303 | 数据列替换主函数, 将多个列中的字符进行替换;返回df(spark格式) 304 | :param df: 305 | :param condition_dict: 306 | :return: 307 | """ 308 | 309 | def mul_regexp_replace(col): 310 | """ 311 | 对每一列进行替换 312 | :param col: 313 | :return: 314 | """ 315 | for item in replace_characters: 316 | col = regexp_replace(col, item["source"], item["target"]) 317 | return col 318 | 319 | # 解析参数 320 | column_names = condition_dict['columnNames'] 321 | replace_characters = condition_dict['replaceCharacters'] 322 | # 对每一列进行替换 323 | for i in range(len(column_names)): 324 | column_name = column_names[i] 325 | df = df.withColumn(column_name, (mul_regexp_replace(df[column_name]))) 326 | return df 327 | 328 | 329 | def fill_null_value(spark_session, operator_id, file_url, condition): 330 | """ 331 | 填充空值 332 | :param spark_session: 333 | :param operator_id: 334 | :param file_url: 335 | :param condition: {'userId':1,'projectId':32,'parameter':[{'operate':'均值填充','colName':''},{'operate':'均值填充','colName':'最大值填充'}]} 336 | :return: 337 | """ 338 | 339 | try: 340 | # 修改计算状态 341 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 342 | # 读取数据 343 | df = read_data(spark_session, file_url) 344 | # 空值填充函数 345 | result_df = fill_null_value_core(df, condition["parameter"]) 346 | # 存储结果 347 | result_file_url = save_data(result_df) 348 | # 修改计算状态 349 | run_info = '数据替换算子执行成功' 350 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 351 | return [result_file_url] 352 | 353 | except Exception as e: 354 | run_info = str(e) 355 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 356 | traceback.print_exc() 357 | return [] 358 | 359 | 360 | def fill_null_value_core(df, condition): 361 | """ 362 | 填充空值核心函数 363 | :param df: 364 | :param condition: 365 | :return: 366 | """ 367 | for i in condition: 368 | if i['operate'] == '均值填充': 369 | mean_item = df.select(func.mean(i['colName'])).collect()[0][0] 370 | df = df.na.fill({i['colName']: mean_item}) 371 | elif i['operate'] == '最大值填充': 372 | mean_item = df.select(func.max(i['colName'])).collect()[0][0] 373 | df = df.na.fill({i['colName']: mean_item}) 374 | elif i['operate'] == '最小值填充': 375 | mean_item = df.select(func.min(i['colName'])).collect()[0][0] 376 | df = df.na.fill({i['colName']: mean_item}) 377 | return df 378 | 379 | 380 | def column_map(spark_session, operator_id, file_url, condition): 381 | """ 382 | 列映射 383 | :param spark_session: 384 | :param operator_id: 385 | :param file_url: 386 | :param condition:{"userId":1,"projectId":32,"parameter":[{"colName_1":"利润", "operate_1":"+","value_1":"100","operate":"+","colName_2":"数量", "operate_2":"*","value_2":"0.0001","newName":"newCol1"},{"colName_1":"利润", "operate_1":"+","value_1":"10","operate":"*","colName_2":"数量", "operate_2":"*","value_2":"0.1","newName":"newCol2"}]} 387 | :return: 388 | """ 389 | 390 | try: 391 | # 修改计算状态 392 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 393 | # 读取数据 394 | df = read_data(spark_session, file_url) 395 | # 列映射函数 396 | result_df = column_map_core(df, condition["parameter"]) 397 | # 存储结果 398 | result_file_url = save_data(result_df) 399 | # 修改计算状态 400 | run_info = '列映射算子执行成功' 401 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 402 | return [result_file_url] 403 | 404 | except Exception as e: 405 | run_info = str(e) 406 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 407 | traceback.print_exc() 408 | return [] 409 | 410 | 411 | def column_map_core(df, condition): 412 | """ 413 | 列映射核心函数 414 | :param df: 415 | :param condition: 416 | :return: 417 | """ 418 | for i in condition: 419 | name1 = i['colName_1'] 420 | name2 = i['colName_2'] 421 | new_name = i['newName'] 422 | if i['operate_1'] == '+': 423 | df = df.withColumn(new_name, df[name1] + i['value_1']) 424 | elif i['operate_1'] == '-': 425 | df = df.withColumn(new_name, df[name1] - i['value_1']) 426 | elif i['operate_1'] == '*': 427 | df = df.withColumn(new_name, df[name1] * i['value_1']) 428 | elif i['operate_1'] == '/': 429 | df = df.withColumn(new_name, df[name1] / i['value1_']) 430 | if not ((name2 == "") or (name2 is None)): 431 | new_name2 = new_name + "_2" 432 | if i['operate_2'] == '+': 433 | df = df.withColumn(new_name2, df[name2] + i['value_2']) 434 | elif i['operate_2'] == '-': 435 | df = df.withColumn(new_name2, df[name2] - i['value_2']) 436 | elif i['operate_2'] == '*': 437 | df = df.withColumn(new_name2, df[name2] * i['value_2']) 438 | elif i['operate_2'] == '/': 439 | df = df.withColumn(new_name2, df[name2] / i['value_2']) 440 | 441 | if i['operate'] == '+': 442 | df = df.withColumn(new_name, df[new_name] + df[new_name2]) 443 | elif i['operate'] == '-': 444 | df = df.withColumn(new_name, df[new_name] - df[new_name2]) 445 | elif i['operate'] == '*': 446 | df = df.withColumn(new_name, df[new_name] * df[new_name2]) 447 | elif i['operate'] == '/': 448 | df = df.withColumn(new_name, df[new_name] / df[new_name2]) 449 | df = df.drop(new_name2) 450 | return df 451 | 452 | 453 | def random_split(spark_session, operator_id, file_url, condition): 454 | """ 455 | 按照比例随机划分数据 456 | :param spark_session: 457 | :param operator_id: 458 | :param file_url: 459 | :param condition: 460 | :return: 461 | """ 462 | try: 463 | # 修改计算状态 464 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 465 | # 读取数据 466 | df = read_data(spark_session, file_url) 467 | # 划分函数 468 | (result_df1, result_df2) = random_split_core(df, condition) 469 | # 存储结果 470 | result_file_url1 = save_data(result_df1) 471 | result_file_url2 = save_data(result_df2) 472 | # 修改计算状态 473 | run_info = '列映射算子执行成功' 474 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url1 + "*," + 475 | result_file_url2, run_info) 476 | return [result_file_url1, result_file_url2] 477 | 478 | except Exception as e: 479 | run_info = str(e) 480 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 481 | traceback.print_exc() 482 | return [] 483 | 484 | 485 | def random_split_core(df, condition): 486 | """ 487 | 划分函数 488 | :param df: 489 | :param condition:{"proportion1": 0.7, "proportion2": 0.3, "seed": 10} 490 | :return: 491 | """ 492 | 493 | seed = condition['seed'] 494 | train = float(condition['proportion1']) 495 | test = float(condition['proportion2']) 496 | (trainingData, testData) = df.randomSplit([train, test], seed=seed) 497 | return trainingData, testData 498 | -------------------------------------------------------------------------------- /app/service/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 逻辑处理 3 | """ 4 | -------------------------------------------------------------------------------- /app/service/ml/Evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | """ 4 | 模型评估 5 | """ 6 | import app.dao.OperatorDao as OperatorDao 7 | from pyspark.mllib.classification import SVMModel 8 | from pyspark.mllib.regression import LabeledPoint 9 | from pyspark.mllib.evaluation import BinaryClassificationMetrics 10 | from pyspark.ml.linalg import Vectors 11 | from pyspark.sql.types import Row 12 | from pyspark.ml.classification import LogisticRegressionModel 13 | from app.Utils import * 14 | 15 | 16 | def second_evaluation(spark_session, operator_id, condition): 17 | """ 18 | 二分类评估 19 | :param spark_session: 20 | :param operator_id: 21 | :param condition: 22 | :return: 23 | """ 24 | try: 25 | # 修改计算状态 26 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 27 | # 评估函数 28 | result_df = second_evaluation_core(spark_session, condition, operator_id) 29 | if isinstance(result_df, str): 30 | OperatorDao.update_operator_by_id(operator_id, 'error', '', result_df) 31 | else: 32 | # 存储结果 33 | result_df.show() 34 | result_file_url = save_data(result_df) 35 | run_info = '评估算子执行成功' 36 | # 修改计算状态 37 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 38 | return [result_file_url] 39 | 40 | except Exception as e: 41 | run_info = str(e) 42 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 43 | traceback.print_exc() 44 | return [] 45 | 46 | 47 | def second_evaluation_core(spark_session, condition, operator_id): 48 | """ 49 | 二分类评估核心函数 50 | :param spark_session: 51 | :param condition: 52 | :param operator_id: 53 | :return: 54 | """ 55 | # 读模型 56 | # 当前节点(评估节点)一个父节点 57 | operator = OperatorDao.get_operator_by_id(operator_id) 58 | # 父节点(预测节点) 两个父节点 59 | father_id = operator.father_operator_ids 60 | father_operator = OperatorDao.get_operator_by_id(father_id) 61 | # 祖节点(模型节点和读预测数据节点) 62 | grand_father_ids = father_operator.father_operator_ids.split(',') 63 | print("**********祖节点(模型节点和读预测数据源节点):", grand_father_ids) 64 | 65 | # 读数据 66 | def get_predict_data(operator_config_): 67 | for grand_father_file_ in operator_config_: 68 | grand_father_id_ = list(grand_father_file_.keys())[0] 69 | grand_father_ = OperatorDao.get_operator_by_id(grand_father_id_) 70 | if grand_father_.operator_type_id == 5001 or grand_father_.operator_type_id < 3000: 71 | print("***************评估函数,预测数据:", grand_father_.operator_type_id) 72 | pre_data_file_url = grand_father_.operator_output_url.split('*,')[ 73 | grand_father_file_[grand_father_id_]] 74 | print("***************评估函数,预测数据url:", pre_data_file_url) 75 | return read_data(spark_session, pre_data_file_url) 76 | 77 | print("**********预测节点:", father_operator.operator_config) 78 | df = get_predict_data(json.loads(father_operator.operator_config)['fileUrl']) 79 | 80 | # 评估 81 | for grand_father_id in grand_father_ids: 82 | grand_father = OperatorDao.get_operator_by_id(grand_father_id) 83 | grand_father_operator_type = grand_father.operator_type_id 84 | # 模型加载节点 85 | if grand_father_operator_type == 8000: 86 | grand_father_operator_type = json.loads(grand_father.operator_config)['parameter']['modelTypeId'] 87 | if grand_father_operator_type == 6001: # svm二分类节点 88 | print("***************评估函数,训练模型", grand_father.operator_type_id) 89 | evaluation_df = svm_second_evaluation(spark_session, grand_father.operator_output_url, df, 90 | json.loads(father_operator.operator_config)['parameter'], condition) 91 | return evaluation_df 92 | elif grand_father_operator_type == 6003: # lr二分类节点 93 | print("***************评估函数,训练模型", grand_father.operator_type_id) 94 | evaluation_df = lr_second_evaluation(spark_session, grand_father.operator_output_url, df, 95 | json.loads(father_operator.operator_config)['parameter'], condition) 96 | return evaluation_df 97 | 98 | 99 | def svm_second_evaluation(spark_session, svm_model_path, df, predict_condition, condition): 100 | """ 101 | svm二分类评估 102 | :param spark_session: 103 | :param svm_model_path: 模型地址 104 | :param df: 预测数据 105 | :param predict_condition: 预测算子(父算子)配置 106 | :param condition: 该算子配置 {"label":"标签"} 107 | :return: 108 | """ 109 | 110 | feature_indexs = predict_condition['features'] 111 | label = condition['label'] 112 | 113 | # 1. 准备数据 114 | def func(x): 115 | features_data = [] 116 | for feature in feature_indexs: 117 | features_data.append(x[feature]) 118 | return LabeledPoint(label=x[label], features=features_data) 119 | 120 | predict_data = df.rdd.map(lambda x: func(x)) 121 | 122 | # 加载模型 123 | svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path) 124 | 125 | # 计算评估指标 126 | svmTotalCorrect = predict_data.map(lambda r: 1 if (svm_model.predict(r.features) == r.label) else 0).reduce( 127 | lambda x, y: x + y) 128 | svmAccuracy = svmTotalCorrect / float(predict_data.count()) 129 | 130 | # 清除默认阈值,这样会输出原始的预测评分,即带有确信度的结果 131 | svm_model.clearThreshold() 132 | svmPredictionAndLabels = predict_data.map(lambda lp: (float(svm_model.predict(lp.features)), lp.label)) 133 | svmMetrics = BinaryClassificationMetrics(svmPredictionAndLabels) 134 | print("Area under PR = %s" % svmMetrics.areaUnderPR) 135 | print("Area under ROC = %s" % svmMetrics.areaUnderROC) 136 | 137 | # 返回数据 138 | result = [("正确个数", float(svmTotalCorrect)), 139 | ("精准度", float(svmAccuracy)), 140 | ("Area under PR", float(svmMetrics.areaUnderPR)), 141 | ("Area under ROC", float(svmMetrics.areaUnderROC))] 142 | return spark_session.createDataFrame(result, schema=['指标', '值']) 143 | 144 | 145 | def lr_second_evaluation(spark_session, lr_model_path, df, predict_condition, condition): 146 | """ 147 | lr二分类评估 148 | :param spark_session: 149 | :param lr_model_path: 模型地址 150 | :param df: 预测数据 151 | :param predict_condition: 预测算子(父算子)配置 152 | :param condition: 该算子配置 {"label":"标签"} 153 | :return: 154 | """ 155 | 156 | feature_indexs = predict_condition['features'] 157 | label_index = condition['label'] 158 | 159 | # 1. 准备数据 160 | def func(x): 161 | features_data = [] 162 | for feature in feature_indexs: 163 | features_data.append(x[feature]) 164 | return Row(label=x[label_index], features=Vectors.dense(features_data)) 165 | 166 | predict_data = df.rdd.map(lambda x: func(x)).toDF() 167 | 168 | # 2.加载模型 169 | print("*****lr_model_path:", lr_model_path) 170 | lr_model = LogisticRegressionModel.load(lr_model_path) 171 | 172 | # 计算评估指标 173 | result = lr_model.transform(predict_data) 174 | print(result.prediction) 175 | lrTotalCorrect = result.rdd.map(lambda r: 1 if (r.prediction == r.label) else 0).reduce(lambda x, y: x + y) 176 | 177 | lrAccuracy = lrTotalCorrect / float(predict_data.count()) # 0.5136044023234485 178 | # # 清除默认阈值,这样会输出原始的预测评分,即带有确信度的结果 179 | lrPredictionAndLabels = result.rdd.map(lambda lp: (float(lp.prediction), float(lp.label))) 180 | lrmetrics = BinaryClassificationMetrics(lrPredictionAndLabels) 181 | 182 | print("Area under PR = %s" % lrmetrics.areaUnderPR) 183 | print("Area under ROC = %s" % lrmetrics.areaUnderROC) 184 | 185 | # 返回数据 186 | result = [("正确个数", float(lrTotalCorrect)), 187 | ("精准度", float(lrAccuracy)), 188 | ("Area under PR", float(lrmetrics.areaUnderPR)), 189 | ("Area under ROC", float(lrmetrics.areaUnderROC))] 190 | return spark_session.createDataFrame(result, schema=['指标', '值']) 191 | -------------------------------------------------------------------------------- /app/service/ml/ModelService.py: -------------------------------------------------------------------------------- 1 | """ 2 | 模型加载 3 | """ 4 | import app.dao.OperatorDao as OperatorDao 5 | import app.dao.MLModelDao as MLModelDao 6 | from app.Utils import * 7 | 8 | 9 | def model_operator(operator_id, condition): 10 | """ 11 | 加载模型算子 12 | :param operator_id: 13 | :param condition:{"MLModelId": 2, "modelTypeId": 6001} 14 | :return: 15 | """ 16 | 17 | try: 18 | # 修改计算状态 19 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 20 | # 评估函数 21 | model_file_url = model_operator_core(condition) 22 | # 修改计算状态 23 | run_info = '模型算子执行成功' 24 | OperatorDao.update_operator_by_id(operator_id, 'success', model_file_url, run_info) 25 | return [model_file_url] 26 | 27 | except Exception as e: 28 | run_info = str(e) 29 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 30 | traceback.print_exc() 31 | return [] 32 | 33 | 34 | def model_operator_core(condition): 35 | # 查询 ml_model 36 | ml_model_id = condition['MLModelId'] 37 | ml_model = MLModelDao.get_ml_model(ml_model_id) 38 | return ml_model.model_url 39 | -------------------------------------------------------------------------------- /app/service/ml/MultipleClassifition.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | 多分类 4 | """ 5 | 6 | import app.dao.OperatorDao as OperatorDao 7 | from app.Utils import * 8 | from pyspark.ml.linalg import Vectors 9 | from pyspark.ml.classification import LogisticRegression, MultilayerPerceptronClassifier 10 | from pyspark.sql.types import Row 11 | 12 | 13 | def model_url(): 14 | """ 15 | 二分类模型保存地址 16 | :return: 17 | """ 18 | return const.MIDDATA + 'model/multipleClassification' 19 | 20 | 21 | def lr(spark_session, operator_id, file_url, condition): 22 | """ 23 | 逻辑回归多分类 24 | :param spark_session: 25 | :param operator_id: 26 | :param file_url: 27 | :param condition: 28 | :return: 29 | """ 30 | try: 31 | # 修改计算状态 32 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 33 | # 读取数据 34 | df = read_data(spark_session, file_url) 35 | # svm_core函数 36 | result_model_url = lr_core(df, condition) 37 | # 修改计算状态 38 | run_info = '逻辑回归多分类算子执行成功' 39 | OperatorDao.update_operator_by_id(operator_id, 'success', result_model_url, run_info) 40 | return [result_model_url] 41 | 42 | except Exception as e: 43 | run_info = str(e) 44 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 45 | traceback.print_exc() 46 | return [] 47 | 48 | 49 | def lr_core(df, condition): 50 | """ 51 | lr多分类核心函数 52 | :param spark_session: 53 | :param df: 54 | :param condition:{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20,"regParam":0.0,"elasticNetParam":0.0,"tol":0.000006,"fitIntercept":True} 55 | :return: 56 | """ 57 | # 参数 58 | label_index = condition['label'] # 标签列(列名或列号) 59 | feature_indexs = condition['features'] # 特征列(列名或列号) 60 | iterations = condition['iterations'] # 最大迭代次数(默认100) 61 | regParam = condition['regParam'] # 正则化参数(默认0.0) 62 | # ElasticNet混合参数,范围为[0,1]。对于alpha = 0,惩罚是L2惩罚。对于alpha = 1,这是L1惩罚(默认值:0.0) 63 | elasticNetParam = condition['elasticNetParam'] 64 | tol = condition['tol'] # 迭代算法的收敛容限(> = 0)(默认值:1e-06即 0.000006) 65 | fitIntercept = condition['fitIntercept'] # 是否训练截距项(默认值:"True","False"可选) 66 | 67 | # 参数类型转换 68 | if isinstance(iterations, str): 69 | iterations = int(iterations) 70 | if isinstance(regParam, str): 71 | regParam = float(regParam) 72 | if isinstance(elasticNetParam, str): 73 | elasticNetParam = float(elasticNetParam) 74 | if isinstance(tol, str): 75 | tol = float(tol) 76 | if isinstance(fitIntercept, str): 77 | if fitIntercept == 'False': 78 | fitIntercept = False 79 | else: 80 | fitIntercept = True 81 | 82 | # 1. 准备数据 83 | def func(x): 84 | features_data = [] 85 | for feature in feature_indexs: 86 | features_data.append(x[feature]) 87 | return Row(label=x[label_index], features=Vectors.dense(features_data)) 88 | 89 | training_set = df.rdd.map(lambda x: func(x)).toDF() 90 | 91 | # 2.训练模型 92 | lr_param = LogisticRegression(featuresCol="features", labelCol="label", predictionCol="prediction", 93 | maxIter=iterations, regParam=regParam, elasticNetParam=elasticNetParam, tol=tol, 94 | fitIntercept=fitIntercept, probabilityCol="probability", 95 | rawPredictionCol="rawPrediction", standardization=True, aggregationDepth=2, 96 | family="multinomial") 97 | lr_model = lr_param.fit(training_set) 98 | print(lr_model.coefficientMatrix) # 系数 99 | print(lr_model.interceptVector) # 截距 100 | print(lr_model.explainParams()) # 参数以及其注解 101 | 102 | # 3.保存模型 103 | lr_model_path = model_url() + '/lr/' + str(uuid.uuid1()) 104 | deltree(lr_model_path) # 删除已经存在的模型 105 | lr_model.write().overwrite().save(lr_model_path) 106 | 107 | return lr_model_path 108 | 109 | 110 | def mpc(spark_session, operator_id, file_url, condition): 111 | """ 112 | mpc多分类 113 | Classifier trainer based on the Multilayer Perceptron. 114 | Each layer has sigmoid activation function, output layer has softmax. 115 | Number of inputs has to be equal to the size of feature vectors. 116 | Number of outputs has to be equal to the total number of labels. 117 | 118 | :param spark_session: 119 | :param operator_id: 120 | :param file_url: 121 | :param condition: 122 | :return: 123 | """ 124 | try: 125 | # 修改计算状态 126 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 127 | # 读取数据 128 | df = read_data(spark_session, file_url) 129 | # svm_core函数 130 | result_model_url = mpc_core(df, condition) 131 | # 修改计算状态 132 | run_info = 'mpc多分类算子执行成功' 133 | OperatorDao.update_operator_by_id(operator_id, 'success', result_model_url, run_info) 134 | return [result_model_url] 135 | 136 | except Exception as e: 137 | run_info = str(e) 138 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 139 | traceback.print_exc() 140 | return [] 141 | 142 | 143 | def mpc_core(df, condition): 144 | """ 145 | mpc多分类核心函数 146 | :param df: 147 | :param condition:{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20,"regParam":0.0,"elasticNetParam":0.0,"tol":0.000006,"fitIntercept":True} 148 | :return: 149 | """ 150 | {"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "seed": 1, "layers": [4, 2, 2], 151 | "stepSize": 0.03, "tol": 0.000001, "blockSize": 128, "solver": "l-bfgs"} 152 | 153 | # maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs" 154 | label_index = condition['label'] # 标签列(列名或列号) 155 | feature_indexs = condition['features'] # 特征列(列名或列号) 156 | iterations = condition['iterations'] # 最大迭代次数(默认100) 157 | tol = condition['tol'] # 迭代算法的收敛容限(> = 0)(默认值:1e-06即 0.000001) 158 | seed = condition['seed'] # 随机种子 159 | layers = condition['layers'] # Sizes of layers from input layer to output layer 160 | blockSize = condition['blockSize'] # Block size for stacking input data in matrices. 161 | stepSize = condition['stepSize'] # 步长,默认值:0.03 162 | solver = condition['solver'] # 是否训练截距项(默认值:"l-bfgs","gd"可选) 163 | 164 | # 参数类型转换 165 | if isinstance(iterations, str): 166 | iterations = int(iterations) 167 | if isinstance(tol, str): 168 | tol = float(tol) 169 | if isinstance(seed, str): 170 | seed = int(seed) 171 | if isinstance(layers, list): 172 | for i in range(len(layers)): 173 | if isinstance(layers[i], str): 174 | layers[i] = int(layers[i]) 175 | if isinstance(blockSize, str): 176 | blockSize = int(blockSize) 177 | if isinstance(stepSize, str): 178 | stepSize = float(stepSize) 179 | 180 | # 1. 准备数据 181 | def func(x): 182 | features_data = [] 183 | for feature in feature_indexs: 184 | features_data.append(x[feature]) 185 | return Row(label=x[label_index], features=Vectors.dense(features_data)) 186 | 187 | training_set = df.rdd.map(lambda x: func(x)).toDF() 188 | 189 | # 2.训练模型 190 | mpc_param = MultilayerPerceptronClassifier(maxIter=iterations, tol=tol, seed=seed, layers=layers, 191 | blockSize=blockSize, stepSize=stepSize, solver=solver) 192 | mpc_model = mpc_param.fit(training_set) 193 | 194 | # 3.保存模型 195 | mpc_model_path = model_url() + '/mpc/' + str(uuid.uuid1()) 196 | deltree(mpc_model_path) # 删除已经存在的模型 197 | mpc_model.write().overwrite().save(mpc_model_path) 198 | 199 | return mpc_model_path 200 | -------------------------------------------------------------------------------- /app/service/ml/PredictService.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | 二分类 4 | """ 5 | from pyspark.mllib.classification import SVMModel 6 | from pyspark.mllib.regression import LabeledPoint 7 | import app.dao.OperatorDao as OperatorDao 8 | from app.Utils import * 9 | from pyspark.ml.linalg import Vectors 10 | from pyspark.sql.types import Row 11 | from pyspark.ml.classification import GBTClassificationModel, LogisticRegressionModel, \ 12 | MultilayerPerceptronClassificationModel 13 | 14 | 15 | def ml_predict(spark_session, operator_id, file_urls, condition): 16 | """ 17 | 机器学习模型预测函数 18 | :param spark_session: 19 | :param operator_id: 20 | :param file_urls: ["modelUrl","predictDataUrl"] 21 | # 两个输入源 一个是模型 一个是预测数据 22 | :param condition: 23 | :return: 24 | """ 25 | try: 26 | # 修改计算状态 27 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 28 | # 读取数据 29 | for url in file_urls: 30 | print("------fileUrl:", file_urls) 31 | if url[-4:] == ".csv": 32 | url1 = url 33 | else: 34 | url0 = url 35 | df = read_data(spark_session, url1) 36 | # 预测函数 37 | result_df = ml_predict_core(spark_session, operator_id, df, url0, condition) 38 | if isinstance(result_df, str): 39 | OperatorDao.update_operator_by_id(operator_id, 'error', '', result_df) 40 | else: 41 | # 存储结果 42 | result_df.show() 43 | result_file_url = save_data(result_df) 44 | run_info = '预测算子执行成功' 45 | # 修改计算状态 46 | OperatorDao.update_operator_by_id(operator_id, 'success', result_file_url, run_info) 47 | return [result_file_url] 48 | 49 | except Exception as e: 50 | run_info = str(e) 51 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 52 | traceback.print_exc() 53 | return [] 54 | 55 | 56 | def ml_predict_core(spark_session, operator_id, df, model_url, condition): 57 | """ 58 | 路由控制加载哪种模型进行预测 59 | :param spark_session: 60 | :param operator_id: 61 | :param df: 62 | :param model_url: 63 | :param condition: 64 | :return: 预测结果 sparkframe 65 | """ 66 | 67 | # 父节点是什么组件 68 | operator = OperatorDao.get_operator_by_id(operator_id) 69 | father_ids = operator.father_operator_ids.split(',') 70 | print("**********", operator.father_operator_ids) 71 | for father_id in father_ids: 72 | father = OperatorDao.get_operator_by_id(father_id) 73 | print("***************", father.operator_type_id) 74 | print("---------------", father.operator_type_id == 6001) 75 | operator_type_flag = father.operator_type_id 76 | 77 | # 模型加载节点 78 | if operator_type_flag == 8000: 79 | operator_type_flag = json.loads(father.operator_config)['parameter']['modelTypeId'] 80 | 81 | if operator_type_flag == 6001: # svm二分类 82 | prediction_df = svm_second_predict(spark_session, model_url, df, condition) 83 | elif operator_type_flag == 6002: # gbdt二分类 84 | prediction_df = gbdt_second_predict(model_url, df, condition) 85 | elif operator_type_flag == 6003: # lr二分类 86 | prediction_df = lr_second_predict(model_url, df, condition) 87 | elif operator_type_flag == 6004: # lr多分类 88 | prediction_df = lr_multiple_predict(model_url, df, condition) 89 | elif operator_type_flag == 6005: # mpc多分类 90 | prediction_df = mpc_multiple_predict(model_url, df, condition) 91 | 92 | # 根据父组件的类型决定加载哪种模型 93 | return prediction_df 94 | 95 | 96 | def svm_second_predict(spark_session, svm_model_path, df, condition): 97 | """ 98 | 支持向量机二分类预测 99 | :param spark_session: spark 会话 100 | :param svm_model_path: 模型地址 101 | :param df: 数据 102 | :param condition: {"features": [12, 13, 14, 15], "label": "label"} 103 | 特征列 104 | :return: 预测结果 sparkframe 105 | """ 106 | feature_indexs = condition['features'] 107 | label_index = condition['label'] 108 | if label_index is None or label_index == "": # 无标签列 109 | # 1. 准备数据 110 | def func(x): 111 | features_data = [] 112 | for feature in feature_indexs: 113 | features_data.append(x[feature]) 114 | return features_data 115 | 116 | predict_data = df.rdd.map(lambda x: func(x)) 117 | print(predict_data.take(10)) 118 | 119 | # 2.加载模型 120 | svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path) 121 | 122 | # 3.预测 123 | def f(x): 124 | return {"prediction_result": x} 125 | 126 | prediction_rdd = svm_model.predict(predict_data) 127 | print(prediction_rdd.take(10)) 128 | prediction_df = prediction_rdd.map(lambda x: Row(**f(x))).toDF() 129 | return prediction_df 130 | else: # 有标签列 131 | # 1. 准备数据 132 | def func(x): 133 | features_data = [] 134 | for feature in feature_indexs: 135 | features_data.append(x[feature]) 136 | return LabeledPoint(label=x[label_index], features=features_data) 137 | 138 | predict_label_data = df.rdd.map(lambda x: func(x)) 139 | print(predict_label_data.take(10)) 140 | 141 | # 2.加载模型 142 | svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path) 143 | 144 | # 3.预测 145 | from pyspark.sql.types import Row 146 | 147 | def f(x): 148 | return {"prediction_result": x[0], label_index: x[1]} 149 | 150 | prediction_rdd = predict_label_data.map(lambda x: (svm_model.predict(x.features), x.label)) 151 | print(prediction_rdd.take(10)) 152 | prediction_df = prediction_rdd.map(lambda x: Row(**f(x))).toDF() 153 | return prediction_df 154 | 155 | 156 | def gbdt_second_predict(gbdt_model_path, df, condition): 157 | """ 158 | gbdt二分类预测 159 | :param gbdt_model_path: 模型地址 160 | :param df: 数据 161 | :param condition: {"features": [12, 13, 14, 15], "label": "label"} 162 | 特征列 163 | :return: 预测结果 sparkframe 164 | """ 165 | feature_indexs = condition['features'] 166 | label_index = condition['label'] 167 | 168 | if label_index is None or label_index == "": # 无标签列 169 | # 1. 准备数据 170 | def func(x): 171 | features_data = [] 172 | for feature in feature_indexs: 173 | features_data.append(x[feature]) 174 | return Row(features=Vectors.dense(features_data)) 175 | 176 | training_set = df.rdd.map(lambda x: func(x)).toDF() 177 | 178 | # 2.加载模型 179 | gbdt_model = GBTClassificationModel.load(gbdt_model_path) 180 | 181 | # 3.预测 182 | prediction_df = gbdt_model.transform(training_set).select("prediction", "features") 183 | return prediction_df 184 | else: # 有标签列 185 | # 1. 准备数据 186 | def func(x): 187 | features_data = [] 188 | for feature in feature_indexs: 189 | features_data.append(x[feature]) 190 | return Row(label=x[label_index], features=Vectors.dense(features_data)) 191 | 192 | training_set = df.rdd.map(lambda x: func(x)).toDF() 193 | 194 | # 2.加载模型 195 | print("****gbdt_model_path:", gbdt_model_path) 196 | gbdt_model = GBTClassificationModel.load(gbdt_model_path) 197 | 198 | # 3.预测 199 | prediction_df = gbdt_model.transform(training_set).select("prediction", "label", "features") 200 | return prediction_df 201 | 202 | 203 | def lr_second_predict(lr_model_path, df, condition): 204 | """ 205 | lr二分类预测 206 | :param lr_model_path: 模型地址 207 | :param df: 数据 208 | :param condition: {"features": [12, 13, 14, 15], "label": "label"} 209 | 特征列 210 | :return: 预测结果 spark dataframe 211 | """ 212 | feature_indexs = condition['features'] 213 | label_index = condition['label'] 214 | 215 | if label_index is None or label_index == "": # 无标签列 216 | # 1. 准备数据 217 | def func(x): 218 | features_data = [] 219 | for feature in feature_indexs: 220 | features_data.append(x[feature]) 221 | return Row(features=Vectors.dense(features_data)) 222 | 223 | training_set = df.rdd.map(lambda x: func(x)).toDF() 224 | 225 | # 2.加载模型 226 | lr_model = LogisticRegressionModel.load(lr_model_path) 227 | 228 | # 3.预测 229 | prediction_df = lr_model.transform(training_set).select("prediction", "features") 230 | return prediction_df 231 | else: # 有标签列 232 | # 1. 准备数据 233 | def func(x): 234 | features_data = [] 235 | for feature in feature_indexs: 236 | features_data.append(x[feature]) 237 | return Row(label=x[label_index], features=Vectors.dense(features_data)) 238 | 239 | training_set = df.rdd.map(lambda x: func(x)).toDF() 240 | 241 | # 2.加载模型 242 | print("*****lr_model_path:", lr_model_path) 243 | lr_model = LogisticRegressionModel.load(lr_model_path) 244 | 245 | # 3.预测 246 | prediction_df = lr_model.transform(training_set).select("prediction", "label", "features") 247 | return prediction_df 248 | 249 | 250 | """ 多分类 """ 251 | 252 | 253 | def lr_multiple_predict(lr_model_path, df, condition): 254 | """ 255 | lr多分类预测 256 | :param lr_model_path: 模型地址 257 | :param df: 数据 258 | :param condition: {"features": [12, 13, 14, 15], "label": "label"} 259 | 特征列 260 | :return: 预测结果 sparkframe 261 | """ 262 | return lr_second_predict(lr_model_path, df, condition) 263 | 264 | 265 | def mpc_multiple_predict(mpc_model_path, df, condition): 266 | """ 267 | mpc多分类预测 268 | :param mpc_model_path: 模型地址 269 | :param df: 数据 270 | :param condition: {"features": [12, 13, 14, 15], "label": "label"} 271 | 特征列 272 | :return: 预测结果 sparkframe 273 | """ 274 | feature_indexs = condition['features'] 275 | label_index = condition['label'] 276 | 277 | if label_index is None or label_index == "": # 无标签列 278 | # 1. 准备数据 279 | def func(x): 280 | features_data = [] 281 | for feature in feature_indexs: 282 | features_data.append(x[feature]) 283 | return Row(features=Vectors.dense(features_data)) 284 | 285 | training_set = df.rdd.map(lambda x: func(x)).toDF() 286 | 287 | # 2.加载模型 288 | mpc_model = MultilayerPerceptronClassificationModel.load(mpc_model_path) 289 | 290 | # 3.预测 291 | prediction_df = mpc_model.transform(training_set).select("prediction", "features") 292 | return prediction_df 293 | else: # 有标签列 294 | # 1. 准备数据 295 | def func(x): 296 | features_data = [] 297 | for feature in feature_indexs: 298 | features_data.append(x[feature]) 299 | return Row(label=x[label_index], features=Vectors.dense(features_data)) 300 | 301 | training_set = df.rdd.map(lambda x: func(x)).toDF() 302 | 303 | # 2.加载模型 304 | print("*****mpc_model_path:", mpc_model_path) 305 | mpc_model = MultilayerPerceptronClassificationModel.load(mpc_model_path) 306 | 307 | # 3.预测 308 | prediction_df = mpc_model.transform(training_set).select("prediction", "label", "features") 309 | return prediction_df 310 | -------------------------------------------------------------------------------- /app/service/ml/SecondClassification.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | 二分类 4 | """ 5 | from pyspark.mllib.classification import SVMWithSGD 6 | from pyspark.mllib.regression import LabeledPoint 7 | import app.dao.OperatorDao as OperatorDao 8 | from app.Utils import * 9 | 10 | from pyspark.ml.linalg import Vectors 11 | from pyspark.ml.classification import GBTClassifier, LogisticRegression 12 | from pyspark.ml.feature import StringIndexer 13 | from pyspark.sql.types import Row 14 | 15 | 16 | def model_url(): 17 | """ 18 | 二分类模型保存地址 19 | :return: 20 | """ 21 | return const.MIDDATA + 'model/secondClassification' 22 | 23 | 24 | def svm(spark_session, operator_id, file_url, condition): 25 | """ 26 | 支持向量机二分类 27 | :param spark_session: 28 | :param operator_id: 29 | :param file_url: 30 | :param condition: 31 | :return: 32 | """ 33 | try: 34 | # 修改计算状态 35 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 36 | # 读取数据 37 | df = read_data(spark_session, file_url) 38 | # svm_core函数 39 | result_model_url = svm_core(spark_session, df, condition) 40 | # 修改计算状态 41 | run_info = '支持向量机二分类算子执行成功' 42 | OperatorDao.update_operator_by_id(operator_id, 'success', result_model_url, run_info) 43 | return [result_model_url] 44 | 45 | except Exception as e: 46 | run_info = str(e) 47 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 48 | traceback.print_exc() 49 | return [] 50 | 51 | 52 | def svm_core(spark_session, df, condition): 53 | """ 54 | 支持向量机二分类核心函数 55 | :param spark_session: 56 | :param df: 57 | :param condition: 58 | {"label": "", "features": [12, 13, 14, 15], "iterations": 20, "step": 1.0, "regParam": 0.01, "regType": "l2", "convergenceTol": 0.001} 59 | :return: 60 | """ 61 | 62 | # 参数 63 | label_index = condition['label'] # 标签列(列名或列号) 64 | feature_indexs = condition['features'] # 特征列(列名或列号) 65 | iterations = condition['iterations'] # 迭代轮数 66 | step = condition['step'] # 步长 67 | reg_param = condition['regParam'] # 正则化系数 68 | reg_type = condition['regType'] # 正则化 69 | convergence_tol = condition['convergenceTol'] # 收敛系数 70 | 71 | # 1. 准备数据 72 | def func(x): 73 | features_data = [] 74 | for feature in feature_indexs: 75 | features_data.append(x[feature]) 76 | return LabeledPoint(label=x[label_index], features=features_data) 77 | 78 | training_data = df.rdd.map(lambda x: func(x)) 79 | 80 | # 2. 训练 81 | svm_model = SVMWithSGD.train(training_data, iterations=iterations, step=step, regParam=reg_param, 82 | miniBatchFraction=1.0, initialWeights=None, regType=reg_type, 83 | intercept=False, validateData=True, convergenceTol=convergence_tol) 84 | 85 | # 3.保存模型 86 | svm_model_path = model_url() + '/svm/' + str(uuid.uuid1()) 87 | deltree(svm_model_path) # 删除已经存在的模型 88 | svm_model.save(spark_session.sparkContext, svm_model_path) 89 | 90 | return svm_model_path 91 | 92 | 93 | def gbdt(spark_session, operator_id, file_url, condition): 94 | """ 95 | # GBDT(Gradient Boosting Decision Tree) 又叫 MART(Multiple Additive Regression Tree),是一种迭代的决策树算法, 96 | # 该算法由多棵决策树组成,所有树的结论累加起来做最终答案。 97 | :param spark_session: 98 | :param operator_id: 99 | :param file_url: 100 | :param condition: 101 | :return: 102 | """ 103 | try: 104 | # 修改计算状态 105 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 106 | # 读取数据 107 | df = read_data(spark_session, file_url) 108 | # svm_core函数 109 | result_model_url = gbdt_core(df, condition) 110 | # 修改计算状态 111 | run_info = 'GBDT二分类算子执行成功' 112 | OperatorDao.update_operator_by_id(operator_id, 'success', result_model_url, run_info) 113 | return [result_model_url] 114 | 115 | except Exception as e: 116 | run_info = str(e) 117 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 118 | traceback.print_exc() 119 | return [] 120 | 121 | 122 | def gbdt_core(df, condition): 123 | """ 124 | gdbt二分类核心函数 125 | :param spark_session: 126 | :param df: 127 | :param condition:{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 0.1, "maxDepth": 5, "minInstancesPerNode": 1, "seed": 1} 128 | :return: 129 | """ 130 | 131 | # 参数 132 | label_index = condition['label'] # 标签列(列名或列号) 133 | feature_indexs = condition['features'] # 特征列(列名或列号) 134 | iterations = condition['iterations'] # 迭代次数 135 | step = condition['step'] # 学习速率(0-1) 136 | max_depth = condition['maxDepth'] # 数的最大深度[1,100] 137 | minInstancesPerNode = condition['minInstancesPerNode'] # 叶子节点最少样本数[1,1000] 138 | seed = condition['seed'] # 随机数产生器种子[0,10] 139 | 140 | # 1. 准备数据 141 | def func(x): 142 | features_data = [] 143 | for feature in feature_indexs: 144 | features_data.append(x[feature]) 145 | return Row(label=x[label_index], features=Vectors.dense(features_data)) 146 | 147 | training_set = df.rdd.map(lambda x: func(x)).toDF() 148 | 149 | string_indexer = StringIndexer(inputCol="label", outputCol="indexed") 150 | si_model = string_indexer.fit(training_set) 151 | tf = si_model.transform(training_set) 152 | 153 | # 2. 训练 154 | gbdt = GBTClassifier(labelCol="indexed", 155 | maxIter=iterations, stepSize=step, maxDepth=max_depth, minInstancesPerNode=minInstancesPerNode, 156 | seed=seed) 157 | gbdt_model = gbdt.fit(tf) 158 | print(gbdt_model.featureImportances) 159 | 160 | # 3.保存模型 161 | svm_model_path = model_url() + '/gbdt/' + str(uuid.uuid1()) 162 | deltree(svm_model_path) # 删除已经存在的模型 163 | gbdt_model.write().overwrite().save(svm_model_path) 164 | 165 | return svm_model_path 166 | 167 | 168 | def lr(spark_session, operator_id, file_url, condition): 169 | """ 170 | 逻辑回归二分类 171 | :param spark_session: 172 | :param operator_id: 173 | :param file_url: 174 | :param condition: 175 | :return: 176 | """ 177 | try: 178 | # 修改计算状态 179 | OperatorDao.update_operator_by_id(operator_id, 'running', '', '') 180 | # 读取数据 181 | df = read_data(spark_session, file_url) 182 | # svm_core函数 183 | result_model_url = lr_core(df, condition) 184 | # 修改计算状态 185 | run_info = '逻辑回归二分类算子执行成功' 186 | OperatorDao.update_operator_by_id(operator_id, 'success', result_model_url, run_info) 187 | return [result_model_url] 188 | 189 | except Exception as e: 190 | run_info = str(e) 191 | OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info) 192 | traceback.print_exc() 193 | return [] 194 | 195 | 196 | def lr_core(df, condition): 197 | """ 198 | lr二分类核心函数 199 | :param spark_session: 200 | :param df: 201 | :param condition:{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20,"regParam":0.0,"elasticNetParam":0.0,"tol":0.000006,"fitIntercept":True,"threshold":0.5} 202 | :return: 203 | """ 204 | # 参数 205 | label_index = condition['label'] # 标签列(列名或列号) 206 | feature_indexs = condition['features'] # 特征列(列名或列号) 207 | iterations = condition['iterations'] # 最大迭代次数(默认100) 208 | regParam = condition['regParam'] # 正则化参数(默认0.0) 209 | # ElasticNet混合参数,范围为[0,1]。对于alpha = 0,惩罚是L2惩罚。对于alpha = 1,这是L1惩罚(默认值:0.0) 210 | elasticNetParam = condition['elasticNetParam'] 211 | tol = condition['tol'] # 迭代算法的收敛容限(> = 0)(默认值:1e-06即 0.000006) 212 | fitIntercept = condition['fitIntercept'] # 是否训练截距项(默认值:"True","False"可选) 213 | threshold = condition['threshold'] # 二进制分类预测中的阈值,范围为[0,1](默认值:0.5) 214 | 215 | # 参数类型转换 216 | if isinstance(iterations, str): 217 | iterations = int(iterations) 218 | if isinstance(regParam, str): 219 | regParam = float(regParam) 220 | if isinstance(elasticNetParam, str): 221 | elasticNetParam = float(elasticNetParam) 222 | if isinstance(tol, str): 223 | tol = float(tol) 224 | if isinstance(fitIntercept, str): 225 | if fitIntercept == 'False': 226 | fitIntercept = False 227 | else: 228 | fitIntercept = True 229 | if isinstance(threshold, str): 230 | threshold = float(threshold) 231 | 232 | # 1. 准备数据 233 | def func(x): 234 | features_data = [] 235 | for feature in feature_indexs: 236 | features_data.append(x[feature]) 237 | return Row(label=x[label_index], features=Vectors.dense(features_data)) 238 | 239 | training_set = df.rdd.map(lambda x: func(x)).toDF() 240 | 241 | # 2.训练模型 242 | lr_param = LogisticRegression(featuresCol="features", labelCol="label", predictionCol="prediction", 243 | maxIter=iterations, regParam=regParam, elasticNetParam=elasticNetParam, tol=tol, 244 | fitIntercept=fitIntercept, threshold=threshold, probabilityCol="probability", 245 | rawPredictionCol="rawPrediction", standardization=True, 246 | aggregationDepth=2, family="auto") 247 | lr_model = lr_param.fit(training_set) 248 | print(lr_model.coefficients) # 系数 249 | print(lr_model.intercept) # 截距 250 | print(lr_model.explainParams()) # 参数以及其注解 251 | 252 | # 3.保存模型 253 | lr_model_path = model_url() + '/lr/' + str(uuid.uuid1()) 254 | deltree(lr_model_path) # 删除已经存在的模型 255 | lr_model.write().overwrite().save(lr_model_path) 256 | 257 | return lr_model_path 258 | 259 | 260 | ''' 261 | 错误 'PipelinedRDD' object has no attribute 'show' 262 | 报这个错,是因为 df.show() is only for spark DataFrame 所致。 263 | ''' 264 | -------------------------------------------------------------------------------- /app/service/ml/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 机器学习Service 3 | """ -------------------------------------------------------------------------------- /app/test/FPGrowthTest.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | sc = SparkContext("local","testing") 3 | from pyspark.mllib.fpm import FPGrowth 4 | data = [["A", "B", "C", "E", "F","O"], ["A", "C", "G"], ["E","I"], ["A", "C","D","E","G"], ["A", "C", "E","G","L"], 5 | ["E","J"],["A","B","C","E","F","P"],["A","C","D"],["A","C","E","G","M"],["A","C","E","G","N"]] 6 | # 转换成RDD 参数numSlices指定了将数据集切分为几份,这里不设置,Spark会尝试根据集群的状况,来自动设定slices的数目 7 | rdd = sc.parallelize(data) 8 | #支持度阈值为20% 9 | model = FPGrowth.train(rdd, 0.3, 2) 10 | print(sorted(model.freqItemsets().collect())) 11 | 12 | 13 | from pyspark.mllib.fpm import PrefixSpan 14 | data = [ 15 | [['a'],["a", "b", "c"], ["a","c"],["d"],["c", "f"]], 16 | [["a","d"], ["c"],["b", "c"], ["a", "e"]], 17 | [["e", "f"], ["a", "b"], ["d","f"],["c"],["b"]], 18 | [["e"], ["g"],["a", "f"],["c"],["b"],["c"]] 19 | ] 20 | rdd = sc.parallelize(data) 21 | model = PrefixSpan.train(rdd, 0.5,4) 22 | print(sorted(model.freqItemsets().collect())) 23 | -------------------------------------------------------------------------------- /app/test/PySparkTest.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | import pyspark.rdd 3 | 4 | APP_NAME = "Random Forest Example2" 5 | SPARK_URL = "spark://10.108.211.130:7077" 6 | spark = SparkSession.builder \ 7 | .appName(APP_NAME) \ 8 | .master(SPARK_URL) \ 9 | .getOrCreate() 10 | # data = spark.sparkContext.parallelize([('Ferrari','fast'),{'Porsche':10000},['Spain','visited',4504]]).collect() 11 | # arr = [] 12 | # arr.append(data) 13 | # print(data[1]['Porsche']) 14 | # print(data[2][1]) 15 | # 16 | # df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) 17 | # df = spark.createDataFrame([(1, "John Doe", 22)], ("id", "name", "age")) 18 | # df.show() 19 | # arr2 = [] 20 | # arr2.append(df) 21 | # for d in arr2: 22 | # d.show() 23 | data = spark.sparkContext.textFile("hdfs://10.108.211.130/user/yufeng/files/spam.txt") 24 | data2 = data.map(lambda x: len(x)) 25 | sum = data2.fold(0, (lambda x, y: x + y)) 26 | 27 | sum 28 | -------------------------------------------------------------------------------- /app/test/RandomForestTest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Random Forest Classification Example. 3 | """ 4 | from pyspark import SparkContext 5 | from pyspark.sql import SparkSession 6 | 7 | if __name__ == "__main__": 8 | 9 | CSV_PATH = "/home/zk/data/creditcard.csv" 10 | APP_NAME = "Random Forest Example" 11 | SPARK_URL = "local[*]" 12 | RANDOM_SEED = 13579 13 | TRAINING_DATA_RATIO = 0.7 14 | RF_NUM_TREES = 3 15 | RF_MAX_DEPTH = 4 16 | RF_MAX_BINS = 32 17 | 18 | spark = SparkSession.builder \ 19 | .appName(APP_NAME) \ 20 | .master(SPARK_URL) \ 21 | .getOrCreate() 22 | 23 | df = spark.read \ 24 | .options(header="true", inferschema="true") \ 25 | .csv(CSV_PATH) 26 | 27 | print("Total number of rows: %d" % df.count()) 28 | 29 | from pyspark.mllib.linalg import Vectors 30 | from pyspark.mllib.regression import LabeledPoint 31 | 32 | transformed_df = df.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1]))) 33 | 34 | splits = [TRAINING_DATA_RATIO, 1.0 - TRAINING_DATA_RATIO] 35 | training_data, test_data = transformed_df.randomSplit(splits, RANDOM_SEED) 36 | 37 | print("Number of training set rows: %d" % training_data.count()) 38 | print("Number of test set rows: %d" % test_data.count()) 39 | 40 | from pyspark.mllib.tree import RandomForest 41 | from time import * 42 | import shutil, os 43 | 44 | start_time = time() 45 | 46 | model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, \ 47 | numTrees=RF_NUM_TREES, featureSubsetStrategy="auto", impurity="gini", \ 48 | maxDepth=RF_MAX_DEPTH, maxBins=RF_MAX_BINS, seed=RANDOM_SEED) 49 | if os.path.exists("myRandomForestClassificationModel"): 50 | shutil.rmtree("myRandomForestClassificationModel") 51 | model.save(spark.sparkContext, "myRandomForestClassificationModel") 52 | 53 | print('Learned classification forest model:') 54 | print(model.numTrees()) 55 | print(model.totalNumNodes()) 56 | print(model.toDebugString()) 57 | end_time = time() 58 | elapsed_time = end_time - start_time 59 | print("Time to train model: %.3f seconds" % elapsed_time) 60 | 61 | predictions = model.predict(test_data.map(lambda x: x.features)) 62 | labels_and_predictions = test_data.map(lambda x: x.label).zip(predictions) 63 | acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(test_data.count()) 64 | print("Model accuracy: %.3f%%" % (acc * 100)) 65 | 66 | from pyspark.mllib.evaluation import BinaryClassificationMetrics 67 | 68 | start_time = time() 69 | 70 | metrics = BinaryClassificationMetrics(labels_and_predictions) 71 | print("Area under Precision/Recall (PR) curve: %.f" % (metrics.areaUnderPR * 100)) 72 | print("Area under Receiver Operating Characteristic (ROC) curve: %.3f" % (metrics.areaUnderROC * 100)) 73 | 74 | end_time = time() 75 | elapsed_time = end_time - start_time 76 | print("Time to evaluate model: %.3f seconds" % elapsed_time) 77 | -------------------------------------------------------------------------------- /app/test/Test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from app.Utils import getProjectCurrentDataUrl 3 | import pandas as pd 4 | 5 | def fullTableStatistics2(): 6 | # columnNames = request.form.getlist("columns") 7 | # projectName = request.form.getlist("projectName") 8 | columnNames = [ "行 ID", 9 | "订单 ID", 10 | "订购日期", 11 | "装运日期", 12 | "装运方式", 13 | "客户 ID", 14 | "客户名称", 15 | "细分市场", 16 | "邮政编码 (Postal Code)", 17 | "城市 (City)", 18 | "省/市/自治区 (State/Province)", 19 | "国家/地区 (Country)", 20 | "地区", 21 | "市场", 22 | "产品 ID", 23 | "类别", 24 | "子类别", 25 | "产品名称", 26 | "销售额", 27 | "数量", 28 | "折扣", 29 | "利润", 30 | "装运成本", 31 | "订单优先级"] 32 | projectName = "爱德信息分析项目" 33 | fileUrl = getProjectCurrentDataUrl(projectName) 34 | if fileUrl[-4:] == ".csv": 35 | df_excel = pd.read_csv(fileUrl, encoding="utf-8") 36 | else: 37 | df_excel = pd.read_excel(fileUrl, encoding="utf-8") 38 | res = [] 39 | statistics = ['字段名','类型','总数','最小值','最小值位置','25%分位数','中位数','75%分位数','均值','最大值','最大值位置','平均绝对偏差','方差','标准差','偏度','峰度'] 40 | for columnName in columnNames: 41 | info = {}.fromkeys(statistics) 42 | info['字段名'] = columnName 43 | info['类型'] = df_excel[columnName].dtype 44 | if info['类型'] == 'int64' or info['类型'] == 'float64': 45 | info['总数'] = df_excel[columnName].count() 46 | info['最小值'] = df_excel[columnName].min() 47 | info['最小值位置'] = df_excel[columnName].idxmin() 48 | info['25%分位数'] = df_excel[columnName].quantile(.25) 49 | info['中位数'] = df_excel[columnName].median() 50 | info['75%分位数'] = df_excel[columnName].quantile(.75) 51 | info['均值'] = df_excel[columnName].mean() 52 | info['最大值'] = df_excel[columnName].max() 53 | info['最大值位置'] = df_excel[columnName].idxmax() 54 | info['平均绝对偏差'] = df_excel[columnName].mad() 55 | info['方差'] = df_excel[columnName].var() 56 | info['标准差'] = df_excel[columnName].std() 57 | info['偏度'] = df_excel[columnName].skew() 58 | info['峰度'] = df_excel[columnName].kurt() 59 | else: 60 | info['类型'] = "text" 61 | res.append(info) 62 | # print(res) 63 | 64 | # fullTableStatistics2() 65 | 66 | str1 = "利润,+,100,+,数量,*,0.0001,newCol" 67 | print(len(str1.split(';'))) #1 68 | str1 = "利润,均值填充;数量,最大值填充" 69 | print(len(str1.split(';'))) #2 70 | str1 = "利润,均值填充;数量,最大值填充;" 71 | print(len(str1.split(';'))) #3 -------------------------------------------------------------------------------- /app/test/zhoukang: -------------------------------------------------------------------------------- 1 | 2 | #数据源 过滤 排序 3 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"data2" :{"type" : "data","name" : 5001,"location" : {"x":"12px", "y":"23px"},"config" : {"fileId" : 2,"fileUrl" : [{"data2":"/home/zk/data/订单信息.csv"}]},"next" : ["pre1"],"pre" : []},"exp1" : {"type" : "exploration","name" : 1001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId":"1","projectId":32,"parameter":[{"colName":"利润", "operate":">", "value":"100", "relation":"AND"},{"colName":"装运方式", "operate":"==", "value":"一级", "relation":""}]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]},"pre1" : {"type" : "preprocess","name" : 1002,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter":{"userId": "1", "projectId": 32, "columnName": "利润", "sortType": "升序"},"fileUrl":[{"data2":0}]},"next" : [],"pre" : ["data2"]}} 4 | 5 | # 排序 6 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1002,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId":"1","projectId":32,"parameter":{"userId":1,"projectId":32,"columnName":"利润","sortType":"降序"}},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 7 | 8 | #数据列拆分 9 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1003,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId": 1, "projectId": 32, "columnName": "订购日期", "delimiter": "/", "newColumnNames": ["year", "月"]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 10 | 11 | # 数据列合并 12 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1005,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId": 1, "projectId": 32, "columnNames": ["类别", "子类别", "产品名称"], "connector": "-", "newColumnName": "品类名称"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 13 | 14 | # 替换 15 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1006,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId": 1, "projectId": 32, "columnNames": ["类别", "子类别", "客户名称"],"replaceCharacters":[{"source":"技术","target":"技术copy"},{"source":"电话","target":"电话copy"}]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 16 | 17 | #填充空值 18 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1007,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"userId":1,"projectId":32,"parameter":[{"operate":"均值填充","colName":"利润"},{"operate":"均值填充","colName":"数量"}]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 19 | 20 | # 列映射 21 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1008,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"parameter":[{"colName_1":"利润", "operate_1":"+","value_1":"100","operate":"+","colName_2":"数量", "operate_2":"*","value_2":"0.0001","newName":"newCol1"},{"colName_1":"利润", "operate_1":"+","value_1":"10","operate":"*","colName_2":"数量", "operate_2":"*","value_2":"0.1","newName":"newCol2"}]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 22 | 23 | # 分位数离散化 24 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"columnName":"装运成本","newColumnName":"装运成本(分位数离散化)","numBuckets":10},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 25 | 26 | # 向量索引 27 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2002,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"columnNames":["装运成本"],"newColumnName":"向量索引转换结果","maxCategories":50},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 28 | 29 | # 标准化 30 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2003,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"projectId":32,"columnNames":["利润"],"newColumnName":"利润(标准化)"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 31 | 32 | # pca 33 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2004,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"columnNames":["销售额","数量","折扣","利润","装运成本"],"newColumnName":"降维结果","k":4},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 34 | 35 | # 字符串转标签 36 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2005,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"columnName":"客户名称","newColumnName":"客户名称(标签化,按频率排序,0为频次最高)"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 37 | 38 | # 独热编码 39 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2006,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":1,"projectId":32,"columnNames":["数量","数量"],"newColumnNames":["独热编码1","独热编码2"]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 40 | 41 | # 多项式扩展 42 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2007,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"projectId":32,"columnNames":["数量","折扣","装运成本"],"newColumnName":"多项式扩展"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 43 | 44 | # 卡方选择 45 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 2008,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"userId":"1","projectId":"订单分析","columnNames":["折扣","装运成本"],"columnName_label":"数量","newColumnName":"卡方选择","numTopFeatures":2},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 46 | 47 | # 全表统计 48 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 3001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"projectId": 32, "columnNames": ["利润"]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 49 | 50 | # 频率统计 51 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 3002,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"projectId":32,"columnName":"类别"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 52 | 53 | # 相关系数 54 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 3003,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"projectId": 32, "columnNames": ["销售额", "折扣", "装运成本"]},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 55 | 56 | # 支持向量机二分类 57 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 6001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 1.0, "regParam": 0.01, "regType": "l2", "convergenceTol": 0.001},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 58 | 59 | # gbdt二分类 60 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 6002,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 0.1, "maxDepth": 5, "minInstancesPerNode": 1, "seed": 1},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 61 | 62 | # 逻辑回归二分类 63 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 6003,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20,"regParam":0.0,"elasticNetParam":0.0,"tol":0.000006,"fitIntercept":"True","threshold":0.5},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 64 | 65 | # 逻辑回归多分类 66 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 6004,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20,"regParam":0.0,"elasticNetParam":0.0,"tol":0.000006,"fitIntercept":"True"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 67 | 68 | # 多层感知机多分类 69 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 6005,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "seed": 1, "stepSize": 0.03,"layers": [4, 2, 2],"tol": 0.000001, "blockSize": 128, "solver": "l-bfgs"},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 70 | 71 | ## 预测 72 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "数据预处理","name" : 6001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 1.0, "regParam": 0.01, "regType": "l2", "convergenceTol": 0.001},"fileUrl":[{"data1":0}]},"next" : ["pre1"],"pre" : ["data1"]},"pre1" : {"type" : "机器学习","name" : 6000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label":"标签","features": [12, 13, 14, 15]},"fileUrl":[{"data1":0},{"exp1":0}]},"next" : [],"pre" : ["exp1","data1"]}} 73 | 74 | # 拆分数据 75 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "exploration","name" : 1009,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" : {"proportion1": 0.7, "proportion2": 0.3, "seed": 10},"fileUrl":[{"data1":0}]},"next" : [],"pre" : ["data1"]}} 76 | 77 | # 评估 78 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["exp1"],"pre" : [""]},"exp1" : {"type" : "数据预处理","name" : 6001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 1.0, "regParam": 0.01, "regType": "l2", "convergenceTol": 0.001},"fileUrl":[{"data1":0}]},"next" : ["pre1"],"pre" : ["data1"]},"pre1" : {"type" : "机器学习","name" : 6000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label":"标签","features": [12, 13, 14, 15]},"fileUrl":[{"data1":0},{"exp1":0}]},"next" : ["eva1"],"pre" : ["exp1","data1"]},"eva1" : {"type" : "机器学习","name" : 7001,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter" :{"label":"标签"},"fileUrl":[]},"next" : [],"pre" : ["pre1"]}} 79 | 80 | 81 | 82 | # 加载数据 + 加载gbdt二分类 + 预测 83 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["pre1"],"pre" : [""]},"model1" : {"type" : "exploration","name" : 8000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"userId": 1, "projectId": 37, "MLModelId": 3, "modelTypeId": 6002},"fileUrl":[]},"next" : ["pre1"],"pre" : []},"pre1" : {"type" : "exploration","name" : 6000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"]},"fileUrl":[{"data1":0},{"model1":0}]},"next" : [],"pre" : ["data1","model1"]}} 84 | 85 | # 加载数据 + 加载逻辑回归二分类 + 预测 86 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["pre1"],"pre" : [""]},"model1" : {"type" : "exploration","name" : 8000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"userId": 1, "projectId": 32, "MLModelId": 5, "modelTypeId": 6003},"fileUrl":[]},"next" : ["pre1"],"pre" : []},"pre1" : {"type" : "exploration","name" : 6000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"]},"fileUrl":[{"data1":0},{"model1":0}]},"next" : [],"pre" : ["data1","model1"]}} 87 | 88 | # 加载数据 + 加载多层感知机多分类 + 预测 89 | {"data1" :{"type" : "data","location" : {"x":"12px", "y":"23px"},"name" : 5001,"config" : {"fileId" : 1,"fileUrl" : [{"data1":"/home/zk/data/订单信息.csv"}]},"next" : ["pre1"],"pre" : [""]},"model1" : {"type" : "exploration","name" : 8000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"userId": 1, "projectId": 32, "MLModelId": 7, "modelTypeId": 6005},"fileUrl":[]},"next" : ["pre1"],"pre" : []},"pre1" : {"type" : "exploration","name" : 6000,"location" : {"x":"12px", "y":"23px"},"config" : {"parameter": {"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"]},"fileUrl":[{"data1":0},{"model1":0}]},"next" : [],"pre" : ["data1","model1"]}} 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /app/views/OperateFlow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from flask import flash, get_flashed_messages, redirect, render_template, request, session, url_for, jsonify, Response, \ 3 | abort 4 | from flask.json import jsonify 5 | from app import app 6 | import json 7 | import os 8 | import time 9 | from app.Utils import * 10 | from app.views import Process 11 | import app.Utils as apus 12 | import pandas as pd 13 | from pyspark.sql import SparkSession 14 | import random 15 | import string 16 | 17 | 18 | # 解决 list, dict 不能返回的问题 19 | class MyResponse(Response): 20 | @classmethod 21 | def force_type(cls, response, environ=None): 22 | if isinstance(response, (list, dict)): 23 | response = jsonify(response) 24 | return super(Response, cls).force_type(response, environ) 25 | 26 | 27 | app.response_class = MyResponse 28 | 29 | 30 | # 解析filter参数函数 31 | def parsingFilterParameters(str): 32 | condition = [] 33 | strList = str.split(';') 34 | for i in range(len(strList)): 35 | ll = strList[i].split(',', 3) 36 | con = {} 37 | con['name'] = ll[0] 38 | con['operate'] = ll[1] 39 | con['value'] = ll[2] 40 | con['relation'] = ll[3] 41 | condition.append(con) 42 | return condition 43 | 44 | 45 | # 查看处理流程 46 | @app.route("/getOperateFlow", methods=['POST']) 47 | def getOperateFlow(): 48 | projectName = request.form.get('projectName') 49 | userId = request.form.get('userId') 50 | project = getProjectByNameAndUserId(projectName, userId) 51 | # print(project) 52 | processflow = getProcessFlowByProjectId(project.id) 53 | operates = json.loads(processflow.operates) 54 | # print(operates) 55 | # for item in operates: 56 | # # print(item) 57 | # # print(item['type']) 58 | # # print(item['operate']) 59 | # if (item['type'] == '1'): 60 | # item['operate'] = parsingFilterParameters(item['operate']) 61 | print(operates) 62 | return operates 63 | 64 | 65 | @app.route("/executeAgain", methods=['POST']) 66 | def executeAgain(): 67 | """ 68 | 重新执行处理流程(DAG)。 69 | 请求,判断这个节点的父节点是否执行完成,如果完成 拿父节点输出的数据 作为输入,处理后存储数据并标记该节点已经完成。 70 | :return: 71 | """ 72 | projectName = request.form.get('projectName') 73 | userId = request.form.get('userId') 74 | nodeId = request.form.get('nodeId') # 节点开始执行的 75 | project = getProjectByNameAndUserId(projectName, userId) 76 | # print(project) 77 | processflow = getProcessFlowByProjectId(project.id) 78 | operates = json.loads(processflow.operates) 79 | fileUrl = getProjectCurrentDataUrl(projectName)['fileUrl'] 80 | # print(operates) 81 | functionName = projectName + "-executeAgain" 82 | 83 | # spark会话 84 | spark = getSparkSession(userId, functionName) 85 | 86 | # 获取数据 87 | df = spark.read.format("CSV").option("header", "true").load(fileUrl) 88 | 89 | # 执行DAG图 90 | for item in operates: 91 | if (item['type'] == '1'): 92 | # 解析参数格式 93 | condition = parsingFilterParameters(item['operate']) 94 | # 过滤函数 95 | df = Process.filterCore(spark, df, condition) 96 | df.show() 97 | 98 | # 处理后的数据写入文件 99 | df.toPandas().to_csv("/home/zk/data/test.csv", header=True) 100 | # 返回前50条数据 101 | data2 = df.limit(50).toJSON().collect() 102 | print(data2) 103 | data3 = ",".join(data2) 104 | print(data3) 105 | data4 = '[' + data3 + ']' 106 | print(data4) 107 | return jsonify({'length': df.count(), 'data': json.loads(data4)}) 108 | -------------------------------------------------------------------------------- /app/views/OperateType.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from flask import jsonify, Response 3 | from app import app 4 | from app.Utils import * 5 | import app.dao.OperatorTypeDao as OperatorTypeDao 6 | 7 | 8 | # 解决 list, dict 不能返回的问题 9 | class MyResponse(Response): 10 | @classmethod 11 | def force_type(cls, response, environ=None): 12 | if isinstance(response, (list, dict)): 13 | response = jsonify(response) 14 | return super(Response, cls).force_type(response, environ) 15 | 16 | 17 | app.response_class = MyResponse 18 | 19 | 20 | @app.route('/operateType/getAll', methods=['GET', 'POST']) 21 | def get_all_operate_type(): 22 | """ 23 | 获取所有的算子种类 24 | :return: 25 | """ 26 | operator_types = OperatorTypeDao.get_all_operator_type() 27 | 28 | aaa = dict() 29 | for i in operator_types: 30 | if i.type_label not in aaa.keys(): 31 | aaa[i.type_label] = [{"id": i.id, "name": i.type_name}] 32 | else: 33 | aaa.get(i.type_label).append({"id": i.id, "name": i.type_name}) 34 | # 和Java一样 存的是数组的引用呀 35 | 36 | item_list = [] 37 | for name in aaa.keys(): 38 | list = aaa.get(name) 39 | item_list.append({'name': name, 'list': list}) 40 | 41 | result = {'list': item_list} 42 | return result 43 | -------------------------------------------------------------------------------- /app/views/Operator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from flask import jsonify, Response, request 3 | from app import app 4 | from app.Utils import * 5 | import app.dao.OperatorDao as OperatorDao 6 | import pandas as pd 7 | import app.service.MLModelService as MLModelService 8 | 9 | 10 | # 解决 list, dict 不能返回的问题 11 | class MyResponse(Response): 12 | @classmethod 13 | def force_type(cls, response, environ=None): 14 | if isinstance(response, (list, dict)): 15 | response = jsonify(response) 16 | return super(Response, cls).force_type(response, environ) 17 | 18 | 19 | app.response_class = MyResponse 20 | 21 | 22 | @app.route('/operate/getOperateResultData', methods=['GET', 'POST']) 23 | def get_operate_result_data(): 24 | """ 25 | 查看算子运行结果数据 26 | :return: 27 | """ 28 | operator_id = request.form.get('operatorId') 29 | start = int(request.form.get('start')) 30 | end = int(request.form.get('end')) 31 | print(operator_id, start, end) 32 | operator = OperatorDao.get_operator_by_id(operator_id) 33 | if operator.status != "success": 34 | return "请执行该节点" 35 | if operator.operator_output_url is not None: 36 | operator_output_url = operator.operator_output_url.split('*,') 37 | else: 38 | return "没有运行结果" 39 | result_arr = [] 40 | try: 41 | for i in range(len(operator_output_url)): 42 | data = pd.read_csv(operator_output_url[i], encoding='utf-8') 43 | if len(data) < end: 44 | end = len(data) 45 | if start > end: 46 | result_arr.append({'length': len(data), 'data': "请输入合法参数", 'position': i}) 47 | else: 48 | data2 = data[int(start):int(end)].to_json(orient='records', force_ascii=False) 49 | result_arr.append({'length': len(data), 'data': json.loads(data2), 'position': i}) 50 | return jsonify(result_arr) 51 | except: 52 | traceback.print_exc() 53 | return "Error,please contact the administrator " 54 | 55 | 56 | @app.route('/operate/saveOperateModel', methods=['GET', 'POST']) 57 | def save_operate_model(): 58 | """ 59 | 对于模型算子 保存模型 60 | :return: 61 | """ 62 | operator_id = request.form.get('operatorId') 63 | user_id = request.form.get('userId') 64 | name = request.form.get('name') 65 | 66 | try: 67 | result = MLModelService.save_ml_model(operator_id, user_id, name) 68 | if isinstance(result, str): 69 | return result 70 | if isinstance(result, bool): 71 | if result is True: 72 | return "success" 73 | return "fail" 74 | except: 75 | traceback.print_exc() 76 | return "Error,please contact the administrator " 77 | 78 | 79 | @app.route('/operate/getOperateModel', methods=['GET', 'POST']) 80 | def get_operate_model(): 81 | """ 82 | 获取保存的模型 83 | :return: 84 | """ 85 | ml_model_id = request.args.get('MLModelId') 86 | project_id = request.args.get('projectId') 87 | user_id = request.args.get('userId') 88 | model_id = request.args.get('modelId') 89 | name = request.args.get('name') 90 | status = request.args.get('status') 91 | 92 | print(ml_model_id, project_id, user_id, model_id, name, status) 93 | try: 94 | results = MLModelService.get_ml_model(ml_model_id, project_id, user_id, model_id, name, status) 95 | return jsonify(results) 96 | except: 97 | traceback.print_exc() 98 | return "Error,please contact the administrator " 99 | 100 | 101 | @app.route('/operate/deleteOperateModel', methods=['GET', 'POST']) 102 | def delete_operate_model(): 103 | """ 104 | 删除保存的模型 105 | :return: 106 | """ 107 | ml_model_id = request.form.get('MLModelId') 108 | try: 109 | model = MLModelService.delete_ml_model(ml_model_id) 110 | return 'success' 111 | except: 112 | traceback.print_exc() 113 | return "Error,please contact the administrator " 114 | -------------------------------------------------------------------------------- /app/views/Project.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import sys 3 | from importlib import reload 4 | 5 | reload(sys) 6 | 7 | from flask import request, jsonify, Response 8 | from flask.json import jsonify 9 | from app import app 10 | from app import db 11 | from app.models.MSEntity import DataSource, Project, Model 12 | import os 13 | from app.Utils import mkdir, getProjectByNameAndUserId 14 | from app.ConstFile import const 15 | 16 | 17 | class MyResponse(Response): 18 | """解决 list, dict 不能返回的问题""" 19 | 20 | @classmethod 21 | def force_type(cls, response, environ=None): 22 | if isinstance(response, (list, dict)): 23 | response = jsonify(response) 24 | return super(Response, cls).force_type(response, environ) 25 | 26 | 27 | app.response_class = MyResponse 28 | 29 | 30 | @app.route('/project/testList', methods=['GET', 'POST']) 31 | def test_list(): 32 | """ 33 | 获取项目列表 34 | :return: 35 | """ 36 | result = [] 37 | 38 | return jsonify(result) 39 | 40 | 41 | @app.route('/project/getAll', methods=['GET', 'POST']) 42 | def get_all(): 43 | """ 44 | 获取项目列表 45 | :return: 46 | """ 47 | data_sources = Project.query.all() 48 | result = [] 49 | for i in data_sources: 50 | result.append({"id": i.id, "name": i.project_name}) 51 | return jsonify(result) 52 | 53 | 54 | @app.route('/project/create', methods=['GET', 'POST']) 55 | def create(): 56 | """ 57 | 创建项目。 58 | 创建项目的时候 创建一个model,现在项目和model是 1:1对应的关系 59 | 60 | :return: 61 | """ 62 | if request.method == 'GET': 63 | projectName = request.form.get('projectName') 64 | userId = request.form.get('userId') 65 | else: 66 | projectName = request.form.get('projectName') 67 | userId = request.form.get('userId') 68 | # 弃用,因此默认 1 69 | data_source_id = 1 70 | print('projectName: {}, dataSourceId: {}, userId: {}'.format(projectName, data_source_id, userId)) 71 | 72 | root_url = const.ROOTURL 73 | 74 | # 数据库中添加Project记录 75 | project = Project(project_name=projectName, project_address=root_url + projectName, user_id=userId, 76 | dataSource_id=data_source_id) 77 | db.session.add(project) 78 | 79 | # 数据库中添加Model记录 80 | # 格式化成2016-03-20 11:45:39形式 81 | import time 82 | project = getProjectByNameAndUserId(projectName, userId) 83 | model = Model(model_name=projectName, project_id=project.id, start_nodes="", 84 | create_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) 85 | db.session.add(model) 86 | db.session.commit() 87 | 88 | # 创建项目目录 89 | try: 90 | if not (os.path.exists(root_url + projectName)): 91 | filters = { 92 | DataSource.id == data_source_id 93 | } 94 | data_sources = DataSource.query.filter(*filters).first() 95 | db.session.commit() 96 | mkdir(root_url + projectName) 97 | print(data_sources.file_url) 98 | return get_all() 99 | else: 100 | return "Double name" 101 | except: 102 | return "error" 103 | -------------------------------------------------------------------------------- /app/views/ProjectModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from flask import request, jsonify, Response 3 | from app import app 4 | from app.Utils import * 5 | from app.dao.ModelDao import * 6 | import app.service.ModelService as ModelService 7 | 8 | 9 | # 解决 list, dict 不能返回的问题 10 | class MyResponse(Response): 11 | @classmethod 12 | def force_type(cls, response, environ=None): 13 | if isinstance(response, (list, dict)): 14 | response = jsonify(response) 15 | return super(Response, cls).force_type(response, environ) 16 | 17 | 18 | app.response_class = MyResponse 19 | 20 | 21 | @app.route("/model/updateFlow", methods=['POST']) 22 | def update_flow(): 23 | """ 24 | 新建 处理流程 25 | :return: 26 | """ 27 | user_id = request.form.get('userId') 28 | project_id = request.form.get('projectId') 29 | config = request.form.get('config') 30 | start_nodes = request.form.get('startNode') 31 | relationship = request.form.get('relationship') 32 | config_order = request.form.get('configOrder') 33 | 34 | print('------updateFlow', user_id, project_id, config, start_nodes, relationship, config_order) 35 | # 更新 model(流程图) 36 | result = ModelService.update_model(project_id, start_nodes, config, relationship, config_order) 37 | 38 | if result is not False: 39 | return "保存成功" 40 | else: 41 | return "保存失败,请重试!" 42 | 43 | 44 | @app.route("/model/getFlow", methods=['POST']) 45 | def get_flow(): 46 | """ 47 | 查看model(执行流程) 48 | :return: 49 | """ 50 | project_id = request.form.get('projectId') 51 | user_id = request.form.get('userId') 52 | 53 | print(project_id, user_id) 54 | flow = ModelService.get_model_by_project_id(project_id) 55 | if flow is False: 56 | return "获取执行流程图失败,请联系工作人员" 57 | 58 | return flow 59 | 60 | 61 | @app.route("/model/getRunStatus", methods=['POST']) 62 | def get_run_status(): 63 | """ 64 | 查看model(执行流程)中每个节点的运行状态 65 | :return: 66 | """ 67 | project_id = request.form.get('projectId') 68 | user_id = request.form.get('userId') 69 | model_execute_id = request.form.get('modelExecuteId') 70 | 71 | print(project_id, user_id, model_execute_id) 72 | # 查看状态 73 | flow = ModelService.get_run_status_by_project_id(project_id, model_execute_id) 74 | 75 | if flow is False: 76 | return "获取执行流程图失败,请联系工作人员" 77 | 78 | return flow 79 | 80 | 81 | @app.route("/model/executeAll", methods=['POST']) 82 | def model_execute_all(): 83 | """ 84 | 从model(执行流程)中的某个节点开始执行 85 | :return: 86 | """ 87 | import _thread 88 | 89 | project_id = request.form.get('projectId') 90 | user_id = request.form.get('userId') 91 | print('-----/model/executeAll-----', user_id, project_id) 92 | 93 | try: 94 | param = ModelService.run_execute_status_from_start(user_id, project_id) 95 | _thread.start_new_thread(ModelService.model_execute, (user_id, project_id, param)) 96 | return {'model_execute_id': param['model_execute_id']} 97 | except: 98 | traceback.print_exc() 99 | print("Error: 无法启动线程") 100 | return '启动失败' 101 | 102 | 103 | @app.route("/model/executeFromOne", methods=['POST']) 104 | def model_execute_from_one(): 105 | """ 106 | 从model(执行流程)中的某个节点开始执行 107 | :return: 108 | """ 109 | import _thread 110 | 111 | project_id = request.form.get('projectId') 112 | user_id = request.form.get('userId') 113 | operator_id = request.form.get('operatorId') 114 | print('-----/model/executeFromOne-----', user_id, project_id, operator_id) 115 | 116 | try: 117 | param = ModelService.run_execute_status_from_one(user_id, operator_id) 118 | _thread.start_new_thread(ModelService.model_execute, (user_id, project_id, param)) 119 | return {'model_execute_id': param['model_execute_id']} 120 | except: 121 | print("Error: 无法启动线程") 122 | return '启动失败' 123 | 124 | return '启动成功' 125 | -------------------------------------------------------------------------------- /app/views/Report.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | from flask import request 3 | from flask.json import jsonify 4 | from app import app 5 | from app import db 6 | from app.models.MSEntity import Report 7 | import traceback 8 | 9 | 10 | @app.route('/report/getAll', methods=['GET', 'POST']) 11 | def report_get_all(): 12 | """ 13 | 获取所有报告 14 | :return: 15 | """ 16 | reports = Report.query.all() 17 | result = [] 18 | for i in reports: 19 | result.append({"id": i.id, "userId": i.user_id, "title": i.report_title, "content": i.report_content}) 20 | return jsonify(result) 21 | 22 | 23 | @app.route('/report/getOne', methods=['GET']) 24 | def report_get_one(): 25 | """ 26 | 获取一个报告 27 | :return: 28 | """ 29 | report_id = int(request.args.get('reportId')) 30 | report = db.session.query(Report).filter(Report.id == report_id).first() 31 | 32 | return {"id": report.id, "userId": report.user_id, "title": report.report_title, "content": report.report_content} 33 | 34 | 35 | @app.route('/report/getReportByUserId', methods=['GET', 'POST']) 36 | def report_get_by_user_id(): 37 | """ 38 | 获取某个用户的所有报告 39 | :return: 40 | """ 41 | user_id = request.args.get('userId') 42 | reports = db.session.query(Report).filter(Report.user_id == user_id) 43 | result = [] 44 | for i in reports: 45 | result.append({"id": i.id, "userId": i.user_id, "title": i.report_title, "content": i.report_content}) 46 | return jsonify(result) 47 | 48 | 49 | @app.route('/report/deleteOne', methods=['POST']) 50 | def report_delete_one(): 51 | """ 52 | 删除一个报告 53 | :return: 54 | """ 55 | try: 56 | report_id = int(request.form.get('reportId')) 57 | db.session.query(Report).filter(Report.id == report_id).delete() 58 | db.session.commit() 59 | return {"status": True, "message": "成功"} 60 | except: 61 | return {"status": False, "message": "失败"} 62 | 63 | 64 | @app.route('/report/updateOne', methods=['POST']) 65 | def report_update_one(): 66 | """ 67 | 更新一个报告 title、content 传值为""是不更新 68 | :return: 69 | """ 70 | try: 71 | report_id = request.form.get('reportId') 72 | report_title = request.form.get('title') 73 | report_content = request.form.get('content') 74 | 75 | update_content = {} 76 | if (report_title is not None) and (report_title != ''): 77 | update_content[Report.report_title] = report_title 78 | if (report_content is not None) and (report_content != ''): 79 | update_content[Report.report_content] = report_content 80 | 81 | db.session.query(Report).filter(Report.id == report_id).update(update_content) 82 | db.session.commit() 83 | return {"status": True, "message": "成功"} 84 | except: 85 | traceback.print_exc() 86 | return {"status": False, "message": "失败"} 87 | 88 | 89 | @app.route('/report/save', methods=['POST']) 90 | def report_save_one(): 91 | """ 92 | 保存一个报告 93 | :return: 94 | """ 95 | try: 96 | user_id = request.form.get('userId') 97 | report_title = request.form.get('title') 98 | report_content = request.form.get('content') 99 | 100 | report = Report(user_id=user_id, report_title=report_title, report_content=report_content) 101 | db.session.add(report) 102 | db.session.commit() 103 | return {"status": True, "message": "成功"} 104 | except: 105 | traceback.print_exc() 106 | return {"status": False, "message": "失败"} 107 | -------------------------------------------------------------------------------- /app/views/Test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import app.views.Process as process 3 | 4 | process.filterMultiConditions() -------------------------------------------------------------------------------- /app/views/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kugomusic/Easy_Data/a74c3cd2c9c3b0e5a9298f8c3b7af2a2f5caf260/app/views/__init__.py -------------------------------------------------------------------------------- /app/views/datasource/DataSource.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | from flask import request, send_from_directory, Response 4 | from app.views.datasource.werkzeug.utils import secure_filename 5 | from app import app, db 6 | from app.models.MSEntity import DataSource 7 | from sqlalchemy.sql import and_, or_, text 8 | 9 | from flask.json import jsonify 10 | import json 11 | import pandas as pd 12 | 13 | ALLOWED_EXTENSIONS = set(['txt', 'csv', 'pdf', 'png', 'jpg', 'jpeg', 'gif']) 14 | 15 | app.config['UPLOAD_FOLDER'] = os.getcwd() + '/../data' 16 | app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 * 1024 17 | 18 | html = ''' 19 | 20 |