├── .gitignore
├── 3rd_PanJiu_AIOps_Competition
│   ├── model
│   │   └── model.pkl
│   ├── code
│   │   ├── requirements.txt
│   │   ├── .DS_Store
│   │   ├── log.py
│   │   ├── stacking.py
│   │   ├── generate_pseudo_label.py
│   │   ├── model.py
│   │   ├── utils.py
│   │   ├── lgb_fs.py
│   │   ├── catboost_fs.py
│   │   ├── get_crashdump_venus_fea.py
│   │   └── generate_feature.py
│   ├── data
│   │   ├── 数据集下载地址
│   │   └── .DS_Store
│   ├── tcdata
│   │   └── 数据集下载地址
│   ├── .DS_Store
│   ├── feature
│   │   └── .DS_Store
│   ├── 答辩PPT
│   │   ├── .DS_Store
│   │   └── 悦智AI实验室_20220525.pdf
│   ├── user_data
│   │   └── .DS_Store
│   ├── docker_push.sh
│   ├── run.sh
│   ├── run.log
│   ├── Dockerfile
│   ├── README.md
│   ├── log
│   │   └── catboost.log
│   └── LICENSE
├── .DS_Store
├── README.md
├── .idea
│   └── workspace.xml
└── LICENSE

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
.idea

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/model/model.pkl:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/requirements.txt:
--------------------------------------------------------------------------------
scikit_learn==1.0.2

--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/.DS_Store

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/data/数据集下载地址:
--------------------------------------------------------------------------------
https://tianchi.aliyun.com/competition/entrance/531947/information?lang=zh-cn

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/tcdata/数据集下载地址:
--------------------------------------------------------------------------------
https://tianchi.aliyun.com/competition/entrance/531947/information?lang=zh-cn

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/.DS_Store

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/code/.DS_Store

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/data/.DS_Store

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/feature/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/feature/.DS_Store

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/答辩PPT/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/答辩PPT/.DS_Store

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/user_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/user_data/.DS_Store

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AI-Competition
Open-sourced code from our past prize-winning competitions

- [Runner-up solution of the 3rd Alibaba Cloud PanJiu AIOps Competition](./3rd_PanJiu_AIOps_Competition/README.md)
- INFO: CatBoost, pseudo-labeling, adversarial validation, millisecond-level prediction (best across all the competitions)

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/答辩PPT/悦智AI实验室_20220525.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/答辩PPT/悦智AI实验室_20220525.pdf

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/docker_push.sh:
--------------------------------------------------------------------------------
# Build the image and push it to your image registry
rm -rf result.zip
# build the image
docker build -t [your registry address]:[TAG] .
# push the image
docker push [your registry address]:[TAG]

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/run.sh:
--------------------------------------------------------------------------------
rm -rf model
#unzip model.zip
python3 code/get_crashdump_venus_fea.py
python3 code/catboost_fs.py
zip -j result.zip prediction_result/catboost_result.csv

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/run.log:
--------------------------------------------------------------------------------
Archive: model.zip
   creating: model/deberta-base/
  inflating: model/debert_model_v21_128_fs_flod_5.h5
  inflating: model/debert_model_v21_128_fs_flod_6.h5
  inflating: model/debert_model_v21_128_fs_flod_8.h5
  inflating: model/README.txt
  inflating: model/weight_cs6399_fold_8_v21_128_fs.npy
  inflating: model/weight_cs6558_fold_5_v21_128_fs.npy
  inflating: model/weight_cs6614_fold_6_v21_128_fs.npy
  inflating: model/weight_fs6138_fold_8_v21_128_fs.npy
  inflating: model/weight_fs6280_fold_5_v21_128_fs.npy
  inflating: model/weight_fs6359_fold_6_v21_128_fs.npy

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/Dockerfile:
--------------------------------------------------------------------------------
# Base Images
## Build from the Tianchi base image
FROM registry.cn-shanghai.aliyuncs.com/tcc-public/python:3
## Copy the current folder into the image root
ADD . /
## Use the root directory as the default working directory (run.sh and the
## generated result files must live here for the submission to run)
WORKDIR /

## Install the required packages
RUN pip config set global.index-url http://mirrors.aliyun.com/pypi/simple/
RUN pip config set install.trusted-host mirrors.aliyun.com
RUN pip3 install -r code/requirements.txt
RUN pip install --upgrade pip
RUN apt -y update
RUN apt install -y zip
RUN apt install vim -y
RUN apt install screen -y
RUN pip install catboost
RUN pip install scikit-learn
RUN pip install tqdm
RUN pip install lightgbm
RUN pip install gensim==4.1.2

## Run sh run.sh when the container starts
CMD ["sh", "run.sh"]

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/log.py:
--------------------------------------------------------------------------------

import logging
import os


class Logger:
    def __init__(self, name, log_path, mode='a'):
        """
        Constructor of the runtime-logging class.
        :param name: name of the log file to write; the suffix defaults to .log
        :param log_path: directory the log file is written to
        :param mode: write mode, a: append, w: overwrite
        Usage:
            1. Create a logger instance:
               logger = Logger("textCNN_train", log_path="../logs").get_log
            2. Write key information to the log file through the instance:
               logger.info("")
        """
        self.__name = name
        self.logger = logging.getLogger(self.__name)
        self.logger.setLevel(logging.DEBUG)
        self.log_path = log_path
        self.mode = mode

        # Create a handler that writes to the log file.
        # log_path = os.path.dirname(os.path.abspath(__file__))
        # Use utf-8 encoding to avoid garbled log text.
        logname = os.path.join(self.log_path, self.__name + '.log')  # output log file name
        # Define the handler's output format.
        formatter = logging.Formatter(
            '%(asctime)s-%(filename)s-[log info]-[%(module)s-%(funcName)s-line:%(lineno)d]-%(levelname)s: %(message)s')

        fh = logging.FileHandler(logname, mode=self.mode, encoding='utf-8')  # single log file; a: append, w: overwrite
        fh.setLevel(logging.DEBUG)

        # Create a handler that also prints to the console.
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)

        fh.setFormatter(formatter)
        ch.setFormatter(formatter)

        # Attach both handlers to the logger.
        self.logger.addHandler(fh)
        self.logger.addHandler(ch)

    @property
    def get_log(self):
        """Return the wrapped logger instance."""
        return self.logger

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/stacking.py:
--------------------------------------------------------------------------------
import os
import numpy as np
import pandas as pd
from utils import RESULT_DIR

lgb_result = pd.read_csv(os.path.join(RESULT_DIR, 'lgb_prob_result.csv'))
lgb_result = lgb_result[lgb_result['label'].isnull()]
print(lgb_result.columns)
del lgb_result['label']

cat_result = pd.read_csv(os.path.join(RESULT_DIR, 'cat_prob_result.csv'))
cat_result = cat_result[cat_result['label'].isnull()]
del cat_result['label']

# bert_result = pd.read_csv(os.path.join(RESULT_DIR, 'bert_prob_result.csv'))

model_weight = {'lgb': 0.2, 'cat': 0.8, 'bert': 0.2}
print(f'MODEL WEIGHT: {model_weight}')
# for i in ['bert_class_0', 'bert_class_1', 'bert_class_2', 'bert_class_3']:
#     bert_result[i] = bert_result[i] * model_weight['bert']

for i in ['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3']:
    cat_result[i] = cat_result[i] * model_weight['cat']

for i in ['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3']:
    lgb_result[i] = lgb_result[i] * model_weight['lgb']
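# -- Editor's sketch (not part of the original script): the two active blend
# weights (lgb 0.2 + cat 0.8) already sum to 1.0; the 'bert' entry is a
# leftover from the commented-out BERT branch above. If more models are
# toggled on, a small helper like this keeps the blended probabilities
# normalized (names are hypothetical):
def normalize_weights(model_weight, active_models):
    total = sum(model_weight[m] for m in active_models)
    return {m: model_weight[m] / total for m in active_models}
# e.g. normalize_weights(model_weight, ['lgb', 'cat']) -> {'lgb': 0.2, 'cat': 0.8}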
result = lgb_result.merge(cat_result, on=['sn', 'fault_time'], how='left')

# result = bert_result.merge(cat_result, on=['sn', 'fault_time'], how='left')
#
# result['class_0'] = result.loc[:, ['cat_class_0', 'bert_class_0']].sum(1)
# result['class_1'] = result.loc[:, ['cat_class_1', 'bert_class_0']].sum(1)
# result['class_2'] = result.loc[:, ['cat_class_2', 'bert_class_0']].sum(1)
# result['class_3'] = result.loc[:, ['cat_class_3', 'bert_class_0']].sum(1)

result['class_0'] = result.loc[:, ['lgb_class_0', 'cat_class_0']].sum(1)
result['class_1'] = result.loc[:, ['lgb_class_1', 'cat_class_1']].sum(1)
result['class_2'] = result.loc[:, ['lgb_class_2', 'cat_class_2']].sum(1)
result['class_3'] = result.loc[:, ['lgb_class_3', 'cat_class_3']].sum(1)

result['label'] = np.argmax(result.loc[:, ['class_0', 'class_1', 'class_2', 'class_3']].values, axis=1)
result = result[['sn', 'fault_time', 'label']]
result.to_csv(os.path.join(RESULT_DIR, 'stacking_result.csv'), index=False)

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/README.md:
--------------------------------------------------------------------------------
# Runner-up solution of the 3rd Alibaba Cloud PanJiu AIOps Competition

## Official site

https://tianchi.aliyun.com/competition/entrance/531947/introduction

## Project layout
```
├── Dockerfile
├── README.md
├── code
│   ├── catboost_fs.py +++++++++++++++++++++++++++++++ model training code
│   ├── generate_feature.py ++++++++++++++++++++++++++ feature generation code
│   ├── generate_pseudo_label.py ++++++++++++++++++++ pseudo-label code
│   ├── get_crashdump_venus_fea.py +++++++++++++++++++ feature generation for the new data
│   ├── requirements.txt +++++++++++++++++++++++++++++ python package versions
│   ├── stacking.py ++++++++++++++++++++++++++++++++++ model ensembling code
│   └── utils.py +++++++++++++++++++++++++++++++++++++ utility script
├── data
│   ├── preliminary_a_test +++++++++++++++++++++++++++ preliminary round A test set
│   ├── preliminary_b_test +++++++++++++++++++++++++++ preliminary round B test set
│   └── preliminary_train ++++++++++++++++++++++++++++ training data
├── docker_push.sh +++++++++++++++++++++++++++++++++++++++++ Docker image build & push script
├── feature
│   └── generation +++++++++++++++++++++++++++++++++++ generated-feature folder
├── log ++++++++++++++++++++++++++++++++++++++++++++++++++++ log folder
│   ├── catboost.log +++++++++++++++++++++++++++++++++ model run log
├── model ++++++++++++++++++++++++++++++++++++++++++++++++++ model files
├── prediction_result ++++++++++++++++++++++++++++++++++++++ model prediction output folder
│   ├── cat_prob_result.csv ++++++++++++++++++++++++++ CATBOOST predicted probabilities
│   ├── catboost_result.csv ++++++++++++++++++++++++++ CATBOOST predictions
│   └── stacking_result.csv ++++++++++++++++++++++++++ ensembled predictions
├── run.log ++++++++++++++++++++++++++++++++++++++++++++++++ code run log
├── run.sh ++++++++++++++++++++++++++++++++++++++++++++++++ code run script
├── tcdata ++++++++++++++++++++++++++++++++++++++++++++++++ final-round test data folder (rename the corresponding preliminary-round files to fill it)
│   ├── final_crashdump_dataset_b.csv ++++++++++++++++ final round B new data file
│   ├── final_sel_log_dataset_b.csv ++++++++++++++++++ final round test log file
│   ├── final_submit_dataset_b.csv +++++++++++++++++++ final round test IDs
│   └── final_venus_dataset_b.csv ++++++++++++++++++++ final round B new data file
├── user_data
│   └── tmp_data +++++++++++++++++++++++++++++++++++++ temporary files
└── 答辩PPT
    └── 悦智AI实验室_20220525.pdf
```
## Runtime environment
Python 3.8; the Python package versions are pinned in requirements.txt and can be installed with:
```
pip install -r code/requirements.txt
```

## Build the image and run the code
### Build the image
```
docker build -t [your image registry]:[TAG] .
```
### Run the image
```
docker run [your image ID] sh run.sh
```
### Push the image
```
docker push [your registry address]:[TAG]
```
### Run & push the image
```
bash docker_push.sh
```

## Run the code
```
bash run.sh
```

--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
(IDE workspace file; the XML markup was stripped in this dump and only
fragments such as the changelist timestamps 1653988069044 / 1653988253675 /
1653988258918 survive, so the content is not reproduced here.)

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/generate_pseudo_label.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import os
from utils import TRAIN_DIR, TEST_A_DIR, TEST_B_DIR, RESULT_DIR, DATA_DIR

log_dataset_a = pd.read_csv(os.path.join(DATA_DIR, 'preliminary_a_test/preliminary_sel_log_dataset_a.csv'))
log_dataset_b = pd.read_csv(os.path.join(DATA_DIR, 'preliminary_b_test/preliminary_sel_log_dataset_b.csv'))
submit_dataset_a = pd.read_csv(os.path.join(DATA_DIR, 'preliminary_a_test/preliminary_submit_dataset_a.csv'))
submit_dataset_b = pd.read_csv(os.path.join(DATA_DIR, 'preliminary_b_test/preliminary_submit_dataset_b.csv'))

# Merge the A/B-round test logs and id lists into a single "C" test set.
log_dataset_c = pd.concat([log_dataset_a, log_dataset_b], ignore_index=True, axis=0)
submit_dataset_c = pd.concat([submit_dataset_a, submit_dataset_b], ignore_index=True, axis=0)

# written under the same name that is read back below
log_dataset_c.to_csv(os.path.join(TEST_A_DIR, 'final_sel_log_dataset_c.csv'), index=False)
submit_dataset_c.to_csv(os.path.join(TEST_A_DIR, 'final_submit_dataset_c.csv'), index=False)


#
# cat_prob = pd.read_csv(os.path.join(RESULT_DIR, '../../../TianchiAIOps_bert_model/cat_prob_result.csv'))
# lgb_prob = pd.read_csv(os.path.join(RESULT_DIR, '../../../TianchiAIOps_bert_model/lgb_prob_result.csv'))

cat_prob = pd.read_csv(os.path.join(RESULT_DIR, 'B_prob_7511.csv'))
lgb_prob = pd.read_csv(os.path.join(RESULT_DIR, 'baseline_prob_7495.csv'))
cat_prob.columns = ['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3', 'label', 'sn',
                    'fault_time']
lgb_prob.columns = ['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3', 'label', 'sn',
                    'fault_time']

# Keep only the unlabeled (test) rows.
lgb_prob = lgb_prob[lgb_prob['label'].isnull()]
cat_prob = cat_prob[cat_prob['label'].isnull()]

cat_prob['cat_prob'] = cat_prob.loc[:, ['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3']].max(1)
cat_prob['cat_label'] = np.argmax(cat_prob.loc[:, ['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3']].values, axis=1)

lgb_prob['lgb_prob'] = lgb_prob.loc[:, ['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3']].max(1)
lgb_prob['lgb_label'] = np.argmax(lgb_prob.loc[:, ['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3']].values, axis=1)

lgb_prob = lgb_prob[['sn', 'fault_time', 'lgb_label', 'lgb_prob']]
cat_prob = cat_prob[['sn', 'fault_time', 'cat_label', 'cat_prob']]

# prob = cat_prob.merge(lgb_prob, on=['sn', 'fault_time'],
#                       how='left')
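# -- Editor's note: the block below concatenates the two per-model frames
# row-wise, fills in the missing counterpart columns, and keeps a sample as a
# pseudo label only when both models are confident (> 0.85) and agree. A
# minimal self-contained restatement of that selection rule (function name is
# hypothetical, not from the original repo):
def select_pseudo_labels(df, threshold=0.85):
    keep = (df['cat_prob'] > threshold) & (df['lgb_prob'] > threshold) \
           & (df['cat_label'] == df['lgb_label'])
    return df.loc[keep, ['sn', 'fault_time', 'cat_label']]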
prob = pd.concat([cat_prob, lgb_prob], ignore_index=True)
prob['cat_prob'] = prob['cat_prob'].fillna(1)
prob['lgb_prob'] = prob['lgb_prob'].fillna(1)
prob.loc[prob['cat_label'].isnull(), 'cat_label'] = prob.loc[prob['cat_label'].isnull(), 'lgb_label']
prob.loc[prob['lgb_label'].isnull(), 'lgb_label'] = prob.loc[prob['lgb_label'].isnull(), 'cat_label']


pseudo_labels = prob.query('cat_prob > 0.85 and lgb_prob > 0.85 and lgb_label == cat_label')

pseudo_labels = pseudo_labels[['sn', 'fault_time', 'cat_label']].rename(columns={'cat_label': 'label'}).reset_index(drop=True)
pseudo_labels.to_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'), index=False)
print(f'pseudo-label data shape: {pseudo_labels.shape}')

pseudo_sel_log_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_sel_log_dataset_c.csv'))
pseudo_sel_log_dataset = pseudo_sel_log_dataset[pseudo_sel_log_dataset['sn'].isin(pseudo_labels['sn'].to_list())]
pseudo_sel_log_dataset.to_csv(os.path.join(TRAIN_DIR, 'pseudo_sel_log_dataset.csv'), index=False)
print(f'pseudo-label log data shape: {pseudo_sel_log_dataset.shape}')

# Build the new test set: everything that did not receive a pseudo label.
final_submit_dataset_d = prob.merge(pseudo_labels, on=['sn', 'fault_time'], how='left')
final_submit_dataset_d = final_submit_dataset_d[final_submit_dataset_d['label'].isnull()][['sn', 'fault_time']].reset_index(drop=True)
final_submit_dataset_d.to_csv(os.path.join(TEST_A_DIR, 'final_submit_dataset_d.csv'), index=False)
print(f'new test set shape: {final_submit_dataset_d.shape}')

final_sel_log_dataset_d = pd.read_csv(os.path.join(TEST_A_DIR, 'final_sel_log_dataset_c.csv'))
final_sel_log_dataset_d = final_sel_log_dataset_d[final_sel_log_dataset_d['sn'].isin(final_submit_dataset_d['sn'].to_list())]

final_sel_log_dataset_d.to_csv(
    os.path.join(TEST_A_DIR, 'final_sel_log_dataset_d.csv'), index=False)
print(f'new test set log data shape: {final_sel_log_dataset_d.shape}')

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/log/catboost.log:
--------------------------------------------------------------------------------
use_less_col:335
features used: 1762
********************** RUN CATBOOST MODEL **********************
****************** CURRENT SEED 42 **********************
FOLD 1 IS RUNNING...
0:    learn: 1.2939006    test: 1.2943096    best: 1.2943096 (0)    total: 135ms    remaining: 22m 27s
800:  learn: 0.2057676    test: 0.2778188    best: 0.2778188 (800)  total: 1m 2s    remaining: 11m 55s
1600: learn: 0.1533318    test: 0.2698555    best: 0.2698522 (1599) total: 2m 4s    remaining: 10m 54s
Stopped by overfitting detector (100 iterations wait)

bestTest = 0.2677497192
bestIteration = 2222

Shrink model to first 2223 iterations.
{'learn': {'MultiClass': 0.12163532058790176}, 'validation': {'MultiClass': 0.26774971916097773}}
FOLD 2 IS RUNNING...
0:    learn: 1.2947765    test: 1.2944610    best: 1.2944610 (0)    total: 81.8ms   remaining: 13m 38s
800:  learn: 0.2009925    test: 0.2969940    best: 0.2969940 (800)  total: 1m 2s    remaining: 11m 53s
Stopped by overfitting detector (100 iterations wait)

bestTest = 0.2898436422
bestIteration = 1413

Shrink model to first 1414 iterations.
{'learn': {'MultiClass': 0.15671545706553627}, 'validation': {'MultiClass': 0.2898436422052235}}
FOLD 3 IS RUNNING...
0:    learn: 1.2956904    test: 1.2979653    best: 1.2979653 (0)    total: 83.6ms   remaining: 13m 55s
800:  learn: 0.2010365    test: 0.3031897    best: 0.3031249 (796)  total: 1m 2s    remaining: 11m 56s
1600: learn: 0.1521093    test: 0.2952955    best: 0.2952927 (1598) total: 2m 4s    remaining: 10m 54s
Stopped by overfitting detector (100 iterations wait)

bestTest = 0.2948664255
bestIteration = 1799

Shrink model to first 1800 iterations.
{'learn': {'MultiClass': 0.13764700334845772}, 'validation': {'MultiClass': 0.2948664254808659}}
FOLD 4 IS RUNNING...
0:    learn: 1.2944941    test: 1.2931731    best: 1.2931731 (0)    total: 83.8ms   remaining: 13m 58s
800:  learn: 0.2055831    test: 0.2798750    best: 0.2798750 (800)  total: 1m 2s    remaining: 11m 54s
1600: learn: 0.1555797    test: 0.2733073    best: 0.2732265 (1590) total: 2m 4s    remaining: 10m 54s
Stopped by overfitting detector (100 iterations wait)

bestTest = 0.2729804824
bestIteration = 1672

Shrink model to first 1673 iterations.
{'learn': {'MultiClass': 0.14819996336927216}, 'validation': {'MultiClass': 0.27298048242230794}}
FOLD 5 IS RUNNING...
0:    learn: 1.2909100    test: 1.2914652    best: 1.2914652 (0)    total: 86.9ms   remaining: 14m 29s
800:  learn: 0.2014462    test: 0.2983963    best: 0.2983963 (800)  total: 1m 2s    remaining: 11m 55s
1600: learn: 0.1523926    test: 0.2909189    best: 0.2907775 (1582) total: 2m 4s    remaining: 10m 54s
Stopped by overfitting detector (100 iterations wait)

bestTest = 0.2898741689
bestIteration = 1887

Shrink model to first 1888 iterations.
{'learn': {'MultiClass': 0.13391467495348316}, 'validation': {'MultiClass': 0.289874168865446}}

OOF-MEAN-ERROR score:0.283063, OOF-STD:0.010657
Init Score: 0.7240031522090993
round: 1
class:0, new_weight:1.01, f1 score: 0.7242893038330873
class:0, new_weight:1.02, f1 score: 0.7244468658037289
class:0, new_weight:1.03, f1 score: 0.7247189260435818
class:0, new_weight:1.05, f1 score: 0.7247883133652404
class:0, new_weight:1.06, f1 score: 0.7253074441711662
class:0, new_weight:1.07, f1 score: 0.7255838308898628
class:0, new_weight:1.09, f1 score: 0.7258591461588992
class:0, new_weight:1.1, f1 score: 0.7263732069942956
class:0, new_weight:1.11, f1 score: 0.7269810148203093
class:0, new_weight:1.12, f1 score: 0.727085092104794
class:0, new_weight:1.19, f1 score: 0.7275673332111965
class:0, new_weight:1.2, f1 score: 0.7277300984468054
class:0, new_weight:1.21, f1 score: 0.7300337938032027
class:0, new_weight:1.22, f1 score: 0.7302916982856817
class:0, new_weight:1.32, f1 score: 0.7302972834627351
class:0, new_weight:1.33, f1 score: 0.7305212560605624
class:0, new_weight:1.34, f1 score: 0.7307742905548762
class:0, new_weight:1.3800000000000001, f1 score: 0.731115696618317
class:0, new_weight:1.3900000000000001, f1 score: 0.7311341774607671
class:0, new_weight:1.4000000000000001, f1 score: 0.7321211157346706
class:0, new_weight:1.41, f1 score: 0.732530278451288
class:0, new_weight:1.42, f1 score: 0.7326514907204666
class:0, new_weight:1.43, f1 score: 0.7326655042252155
class:0, new_weight:1.44, f1 score: 0.7340465325949609
class:0, new_weight:1.45, f1 score: 0.7349701799847135
class:2, new_weight:0.47000000000000003, f1 score: 0.7351355277520346
class:2, new_weight:0.51, f1 score: 0.7352366908078052
class:2, new_weight:0.52, f1 score: 0.7354704485017871
class:2, new_weight:0.53, f1 score: 0.7356003615547112
class:2, new_weight:0.54, f1 score: 0.7358162977063339
class:2, new_weight:0.55, f1 score: 0.7360528528073605
class:2, new_weight:0.6, f1 score: 0.7360930396635706
class:2, new_weight:0.62, f1 score: 0.7361315695490319
class:3, new_weight:0.77, f1 score: 0.736236509770795
class:3, new_weight:0.79, f1 score: 0.7362861930960579
class:3, new_weight:0.8, f1 score: 0.73637330491084
class:3, new_weight:0.81, f1 score: 0.7364039172775363
class:3, new_weight:0.8200000000000001, f1 score: 0.7365143106346561
class:3, new_weight:0.8300000000000001, f1 score: 0.7366247783303799
round: 2
class:2, new_weight:0.55, f1 score: 0.7366811046538598
round: 3
********************** SEARCH BEST WEIGHT : [1.45, 1.0, 0.55, 0.8300000000000001] **********************
********************** BEST MACRO_F1 : 0.7366811046538598 **********************

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/model.py:
--------------------------------------------------------------------------------
import warnings
import datetime
import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

from utils import N_ROUNDS
import pickle
import os
warnings.filterwarnings('ignore')


def get_model_feature_importances(model):
    feature_importances = pd.DataFrame()
    feature_importances['fea'] = model.feature_names_
    feature_importances['importances'] = model.feature_importances_
    feature_importances = feature_importances.sort_values('importances', ascending=False).reset_index(drop=True)

    return feature_importances


def run_cbt(train, target, test, k, seed, NUM_CLASS=4, cat_cols=[]):
    print('********************** RUN CATBOOST MODEL **********************')
    print(f'****************** CURRENT SEED {seed} ********************** ')
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    oof_prob = np.zeros((train.shape[0], NUM_CLASS))
    test_prob = np.zeros((test.shape[0], NUM_CLASS))
    feature_importance_df = []
    offline_score = []
    model_list = []

    ## K-Fold
    for fold, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
        print("FOLD {} IS RUNNING...".format(fold + 1))
        trn_x, trn_y = train.loc[trn_idx], target.loc[trn_idx]
        val_x, val_y = train.loc[val_idx], target.loc[val_idx]
        catboost_model = CatBoostClassifier(
            iterations=N_ROUNDS,
            od_type='Iter',
            od_wait=120,
            max_depth=8,
            learning_rate=0.05,
            l2_leaf_reg=9,
            random_seed=seed,
            fold_len_multiplier=1.1,
            loss_function='MultiClass',
            logging_level='Verbose',
            # task_type="GPU"

        )

        start_time = datetime.datetime.now()

        catboost_model.fit(trn_x,
                           trn_y,
                           eval_set=(val_x, val_y),
                           use_best_model=True,
                           verbose=800,
                           early_stopping_rounds=100,
                           cat_features=cat_cols,
                           )
        end_time = datetime.datetime.now()
        model_train_cost_time = end_time - start_time
        print('****************** MODEL TRAIN COST TIME : ', str(model_train_cost_time), ' ******************')

        start_time = datetime.datetime.now()
        oof_prob[val_idx] = catboost_model.predict_proba(train.loc[val_idx])
        end_time = datetime.datetime.now()
        model_pred_cost_time = end_time - start_time
        print('****************** MODEL PREDICT COST TIME : ', str(model_pred_cost_time), ' ******************')
        # catboost_model = catboost_model.get_best_iteration()
        test_prob += catboost_model.predict_proba(test) / folds.n_splits
        print(catboost_model.get_best_score())
        offline_score.append(catboost_model.get_best_score()['validation']['MultiClass'])

        feature_importance_df.append(get_model_feature_importances(catboost_model))
        model_list.append(catboost_model)
        with open(os.path.join('../model', f'cat_model_flod_{fold}.pkl'), 'wb') as f:
            pickle.dump(catboost_model, f)
    print('\nOOF-MEAN-ERROR score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    fea_imp_df = pd.concat(feature_importance_df, ignore_index=True).groupby('fea').agg(
        {'importances': 'mean'}).reset_index().sort_values('importances', ascending=False).reset_index(drop=True)

    return oof_prob, test_prob, fea_imp_df, model_list
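# -- Editor's sketch: run_cbt pickles one CatBoost model per fold
# ('cat_model_flod_{fold}.pkl', spelling as written to disk). At serving time
# the fold models can be reloaded and their probabilities averaged; this is
# an assumption about deployment, not code from the original repo.
def predict_with_fold_models(X, k=5, model_dir='../model'):
    prob = None
    for fold in range(k):
        with open(os.path.join(model_dir, f'cat_model_flod_{fold}.pkl'), 'rb') as f:
            fold_model = pickle.load(f)
        p = fold_model.predict_proba(X)
        prob = p if prob is None else prob + p
    return prob / k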
def run_lgb(train, target, test, k, seed=42, NUM_CLASS=4, cat_cols=[]):
    # feats = [f for f in train.columns if f not in ['cust_no', 'label', 'I7', 'I9', 'B6']]
    # print('Current num of features:', len(feats))
    print(f'********************** RUN LGBM MODEL **********************')
    print(f'****************** CURRENT SEED {seed} ********************** ')
    # LightGBM dislikes special characters in column names and expects integer
    # category codes, so remap the columns to positional names first.
    cols_map = {j: i for i, j in enumerate(train.columns)}
    cat_cols = [cols_map[i] for i in cat_cols]
    train = train.rename(columns=cols_map)
    test = test.rename(columns=cols_map)
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    oof_prob = np.zeros((train.shape[0], NUM_CLASS))
    test_prob = np.zeros((test.shape[0], NUM_CLASS))
    fea_imp_df_list = []
    offline_score = []
    model_list = []
    ## K-Fold
    for fold, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
        params = {
            "objective": "multiclass",
            "num_class": NUM_CLASS,
            "learning_rate": 0.01,
            "max_depth": -1,
            "num_leaves": 32,
            "verbose": -1,
            "bagging_fraction": 0.8,
            "feature_fraction": 0.8,
            "seed": seed,
            'metric': 'multi_error'

        }
        print("FOLD {} IS RUNNING...".format(fold + 1))
        trn_data = lgb.Dataset(train.loc[trn_idx], label=target.loc[trn_idx])
        val_data = lgb.Dataset(train.loc[val_idx], label=target.loc[val_idx])

        # train
        params['seed'] = seed
        lgb_model = lgb.train(
            params,
            trn_data,
            num_boost_round=N_ROUNDS,
            valid_sets=[trn_data, val_data],
            early_stopping_rounds=100,
            verbose_eval=200,
            categorical_feature=cat_cols,

        )
        # predict
        oof_prob[val_idx] = lgb_model.predict(train.loc[val_idx], num_iteration=lgb_model.best_iteration)
        test_prob += lgb_model.predict(test, num_iteration=lgb_model.best_iteration) / folds.n_splits
        offline_score.append(lgb_model.best_score['valid_1']['multi_error'])
        fea_imp = pd.DataFrame()
        fea_imp['feature_name'] = lgb_model.feature_name()
        fea_imp['importance'] = lgb_model.feature_importance()
        fea_imp['feature_name'] = fea_imp['feature_name'].map({str(cols_map[i]): i for i in cols_map})
        fea_imp = fea_imp.sort_values('importance', ascending=False)
        fea_imp_df_list.append(fea_imp)

        model_list.append(lgb_model)
    print('\nOOF-MEAN-ERROR score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    fea_imp_df = pd.concat(fea_imp_df_list, ignore_index=True).groupby('feature_name').agg(
        {'importance': 'mean'}).reset_index().sort_values('importance', ascending=False)
    return oof_prob, test_prob, fea_imp_df, model_list
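# -- Editor's note: lgb.train() above relies on the early_stopping_rounds /
# verbose_eval keyword arguments, which exist in the LightGBM versions this
# repo targets but were removed in LightGBM 4.x. On newer versions the
# equivalent is the callbacks API (sketch, assuming lightgbm >= 3.3):
#   lgb.train(params, trn_data, num_boost_round=N_ROUNDS,
#             valid_sets=[trn_data, val_data],
#             callbacks=[lgb.early_stopping(stopping_rounds=100),
#                        lgb.log_evaluation(period=200)])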
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/utils.py:
--------------------------------------------------------------------------------
import os
import sys
from log import Logger
from collections import Counter
from tqdm import tqdm
import numpy as np
import pandas as pd

ROOT_DIR = os.path.join(sys.path[0], '../')
LOG_DIR = os.path.join(ROOT_DIR, 'log')

DATA_DIR = os.path.join(ROOT_DIR, 'data')
TRAIN_DIR = os.path.join(DATA_DIR, 'preliminary_train')
# switch these paths when submitting the docker image
MODEL_PATH = os.path.join(ROOT_DIR, './model/deberta-base')
MODEL_1_PATH = os.path.join(ROOT_DIR, './model')
TEST_A_DIR = os.path.join(ROOT_DIR, './tcdata')
# TEST_A_DIR = os.path.join(ROOT_DIR, './tcdata_test')
PSEUDO_FALG = True
TEST_B_DIR = os.path.join(ROOT_DIR, 'tcdata')



RESULT_DIR = os.path.join(ROOT_DIR, 'prediction_result')

FEATURE_DIR = os.path.join(ROOT_DIR, 'feature')
GENERATION_DIR = os.path.join(FEATURE_DIR, 'generation')
CORRELATION_DIR = os.path.join(FEATURE_DIR, 'correlation')


USER_DATA_DIR = os.path.join(ROOT_DIR, 'user_data')
USER_MODEL_DIR = os.path.join(USER_DATA_DIR, 'model_data')
TMP_DIR = os.path.join(USER_DATA_DIR, 'tmp_data')
N_ROUNDS = 10000
TIME_INTERVAL = 60

KEY_1 = ['OEM record c2', 'Processor CPU_Core_Error', '001c4c', 'System Event Sys_Event', 'Power Supply PS0_Status',
         'Temperature CPU0_Margin_Temp', 'Reading 51 > Threshold 85 degrees C', 'Lower Non-critical going low',
         'Temperature CPU1_Margin_Temp', 'System ACPI Power State #0x7d', 'Lower Critical going low']
KEY_2 = ['OEM CPU0 MCERR', 'OEM CPU0 CATERR', 'Reading 0 < Threshold 2 degrees C', '0203c0a80101',
         'Unknown CPU0 MCERR', 'Unknown CPU0 CATERR', 'Microcontroller #0x3b', 'System Boot Initiated',
         'Processor #0xfa', 'Power Unit Pwr Unit Status', 'Hard reset', 'Power off/down', 'System Event #0xff',
         'Memory CPU1A1_DIMM_Stat', '000000', 'Power cycle', 'OEM record c3', 'Memory CPU1C0_DIMM_Stat',
         'Reading 0 < Threshold 1 degrees C', 'IERR']
KEY_3 = ['Memory', 'Correctable ECC logging limit reached', 'Memory MEM_CHE0_Status', 'Memory Memory_Status',
         'Memory #0x87', 'Memory CPU0F0_DIMM_Stat', 'Memory Device Disabled', 'Memory #0xe2',
         'OS Stop/Shutdown OS Status', 'System Boot Initiated System Restart', 'OS Boot BIOS_Boot_Up',
         'System Boot Initiated BIOS_Boot_UP', 'Memory DIMM101', 'OS graceful shutdown', 'OS Critical Stop OS Status',
         'Memory #0xf9', 'Memory CPU0C0_DIMM_Stat', 'Memory DIMM111', 'Memory DIMM021', ]
KEY_4 = ['Drive Fault', 'NMI/Diag Interrupt', 'Failure detected', 'Power Supply AC lost', 'Power Supply PSU0_Supply',
         'AC out-of-range, but present', 'Predictive failure', 'Drive Present', 'Temperature Temp_DIMM_KLM',
         'Temperature Temp_DIMM_DEF', 'Power Supply PS1_Status', 'Identify Status', 'Power Supply PS2_Status',
         'Temperature DIMMG1_Temp', 'Upper Non-critical going high', 'Temperature DIMMG0_Temp',
         'Upper Critical going high', 'Power Button pressed', 'System Boot Initiated #0xb8', 'Deasserted']
TOP_KEY_WORDS = ['0203c0a80101',
'Configuration Error', 'Correctable ECC', 'Deasserted', 'Device Enabled', 'Drive Present', 56 | 'Event Logging Disabled SEL', 'Failure detected', 'IERR', 'Initiated by hard reset', 'Initiated by power up', 57 | 'Initiated by warm reset', 'Log area reset/cleared', 'Memory', 'Memory #0xe2', 'Memory CPU0C0', 58 | 'Microcontroller/Coprocessor BMC', 'OEM CPU0 CATERR', 'OEM CPU0 MCERR', 'OS Boot BIOS', 59 | 'OS Critical Stop OS Status', 'Power Supply PS1', 'Power Supply PS2', 'Presence detected', 'Processor', 'Processor CPU', 'Processor CPU0', 60 | 'Processor CPU1', 'S0/G0: working', 'S4/S5: soft-off', 'Slot / Connector PCIE', 'State Asserted', 'State Deasserted', 61 | 'System ACPI Power State ACPI', 'System Boot Initiated', 'System Boot Initiated #0xe0', 'System Boot Initiated BIOS', 62 | 'System Event', 'System Event #0x10', 'System Event #0xff', 'Timestamp Clock Sync', 'Transition to Running', 'Uncorrectable ECC', 63 | 'Uncorrectable machine check exception', 'Unknown CPU0 CATERR', 'Unknown CPU0 MCERR', 'Unknown Chassis', 'Watchdog2 IPMI', 64 | ] 65 | TOP_KEY_WORDS_2 = ['Processor CPU0 Status', 'System Boot Initiated BIOS Boot Up', 'Uncorrectable ECC', 'Initiated by power up', 66 | 'Configuration Error', 'Processor CPU CATERR', 'Processor CPU1 Status', 'Memory #0xe2', 'IERR', 'Initiated by warm reset', 67 | 'State Asserted', 'S4/S5: soft-off', 'Memory #0xf9', 'S0/G0: working', 'boot completed - device not specified', 'Timestamp Clock Sync', 68 | 'Presence detected', 'System Boot Initiated #0xe0', 'Drive Fault', 'Power Supply PS1 Status', 'Power off/down', 'OS Boot #0xe9', 69 | 'Failure detected', 'Uncorrectable machine check exception', 'Transition to Running', 'Power Supply PS2 Status', 70 | 'Memory Device Disabled', 'System Restart', 'System Event #0x10', 'Sensor access degraded or unavailable', 'Unknown #0x17', 71 | 'Drive Present', 'Management Subsys Health System Health', 'Power Supply AC lost', 'Microcontroller #0x16'] 72 | CHARATERS = ['#', '&', ] 73 | # KEY_WORDS = KEY_1 + KEY_2 + KEY_3 + KEY_4 + CHARATERS 74 | KEY_WORDS = KEY_1 + KEY_2 + KEY_3 + KEY_4 + CHARATERS + TOP_KEY_WORDS 75 | KEY_WORDS = list(set(KEY_WORDS)) 76 | # cnt_1_0_diff_key_words = ['State Asserted','Processor CPU_CATERR','Unknown #0x17','Microcontroller #0x16','Transition to Running','State Deasserted','Processor #0xfa','Temperature CPU1_Margin_Temp','Temperature CPU0_Margin_Temp','Power cycle','Management Subsys Health System_Health','Sensor access degraded or unavailable','Power off/down','System ACPI Power State #0x7d'] 77 | # key_words_0 = ['Temperature CPU0_Margin_Temp','Lower Critical going low','System ACPI Power State #0x7d','Temperature CPU1_Margin_Temp','Lower Non-critical going low','Uncorrectable machine check exception','Reading 0 < Threshold 1 degrees C','000000','Unknown #0x19','Temperature DIMMG1_Temp','Reading 0 < Threshold 0 degrees C','001c4c','IERR','Upper Critical going high','Unknown Chassis_control','Temperature DIMMG0_Temp','Upper Non-critical going high','Temperature Temp_DIMM_DEF','Power cycle','Processor CPU0_Status','Temperature Temp_DIMM_KLM','Processor CPU1_Status','Management Subsys Health System_Health'] 78 | # key_words_1 = ['Processor #0xfa','State Deasserted','Power off/down','Power cycle','IERR','Unknown #0x17','Management Subsys Health System_Health','Processor CPU_CATERR','Reading 0 < Threshold 1 degrees C','','Sensor access degraded or unavailable','Transition to Running','State Asserted','Microcontroller #0x16','Processor CPU0_Status','Processor CPU1_Status','Slot / Connector 
PCIE_Status','Fault Status','System ACPI Power State ACPI_PWR_Status','Management Subsystem Health System_Health','Configuration Error','Uncorrectable machine check exception','Timestamp Clock Sync'] 79 | # key_words_2 = ['Memory #0xe2','Memory Device Disabled','Memory #0x87','Memory #0xf9','Correctable ECC','Memory CPU0D0_DIMM_Stat','Uncorrectable ECC','Memory CPU1B0_DIMM_Stat','System Boot Initiated BIOS_Boot_UP','System Restart','Presence Detected','Temperature CPU0_Temp','boot completed - device not specified','Log almost full','Device Present','Legacy OFF state','System Boot Initiated #0xe0','System Event #0x10','Legacy ON state','OS Boot #0xe0','Unknown #0xc5','System Boot Initiated #0xb8','Event Logging Disabled SEL_Status'] 80 | # key_words_3 = ['Drive Fault','Failure detected','Drive Present','Temperature Temp_DIMM_KLM','Temperature Temp_DIMM_DEF','Power Supply PS4_Status','Upper Non-critical going high','Temperature DIMMG0_Temp','Temperature DIMMG1_Temp','Power Supply PS3_Status','Upper Critical going high','Predictive failure','Power Supply AC lost','Unknown #0x19','Power Unit Power Unit','AC out-of-range, but present','Power Supply PS1_Status','Power Supply PS2_Status','Log area reset/cleared','Microcontroller/Coprocessor BMC_Boot_Up','System Boot Initiated #0xb8','Power Button pressed','Device Present'] 81 | # top_key_words = [ 'Configuration Error','Uncorrectable ECC','Processor CPU0_Status','Initiated by power up','','Presence Detected','Processor CPU1_Status','S0/G0: working','Processor CPU_CATERR','Presence detected','S4/S5: soft-off','Upper Critical going high','Memory #0xe2','IERR','Initiated by warm reset','State Asserted','Upper Non-critical going high','boot completed - device not specified','Memory Device Disabled','Timestamp Clock Sync','Lower Critical going low','Transition to Running','Memory #0xf9','Power Supply PS1_Status'] 82 | # key_words_1_desc = ['#0xfa', '#0x','#0xff','CATERR','cycle','Unit','IERR','IPMI','#0x17', 'Running','#0x7c','Unknown','CPU', 'Sensor','CPU0','CPU1','Subsys'] 83 | # 84 | # key_words = cnt_1_0_diff_key_words +key_words_0+key_words_1+key_words_2+key_words_3+top_key_words+key_words_1_desc 85 | # key_words = list(set(key_words)) 86 | # KEY_WORDS = key_words+CHARATERS 87 | 88 | 89 | def create_dir(dir): 90 | """ 91 | 创建目录 92 | :param dir: 目录名 93 | :return: 94 | """ 95 | if not os.path.exists(dir): 96 | os.mkdir(dir) 97 | print(f'{dir}目录不存在,创建{dir}目录成功.') 98 | else: 99 | print(f'{dir}目录已存在.') 100 | 101 | 102 | def create_all_dir(): 103 | """ 104 | 创建所有需要的目录 105 | :return: 106 | """ 107 | create_dir(ROOT_DIR) 108 | create_dir(LOG_DIR) 109 | 110 | # create_dir(MODEL_DIR) 111 | create_dir(RESULT_DIR) 112 | 113 | create_dir(FEATURE_DIR) 114 | create_dir(GENERATION_DIR) 115 | create_dir(CORRELATION_DIR) 116 | 117 | create_dir(DATA_DIR) 118 | create_dir(TRAIN_DIR) 119 | create_dir(TEST_A_DIR) 120 | # create_dir(TEST_B_DIR) 121 | 122 | create_dir(USER_DATA_DIR) 123 | create_dir(USER_MODEL_DIR) 124 | create_dir(TMP_DIR) 125 | 126 | 127 | def clean_str(string): 128 | return string 129 | 130 | 131 | def my_tokenizer(s): 132 | return s.split(' | ') 133 | 134 | 135 | def get_word_counter(data): 136 | print('获取异常日志计数字典') 137 | 138 | counter = Counter() 139 | for string_ in tqdm(data['msg']): 140 | string_ = string_.strip() 141 | counter.update(my_tokenizer(clean_str(string_))) 142 | return counter 143 | 144 | 145 | def macro_f1(target_df: pd.DataFrame, submit_df: pd.DataFrame): 146 | """ 147 | 计算得分 148 | :param target_df: [sn,fault_time,label] 149 | 
    :param submit_df: [sn,fault_time,label]
    :return:
    """

    weights = [5 / 11, 4 / 11, 1 / 11, 1 / 11]

    # weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]
    overall_df = target_df.merge(
        submit_df, how='left', on=[
            'sn', 'fault_time'], suffixes=[
            '_gt', '_pr'])
    # fillna returns a copy, so assign the result back
    overall_df = overall_df.fillna(-1)
    macro_F1 = 0.
    for i in range(len(weights)):
        TP = len(overall_df[(overall_df['label_gt'] == i)
                            & (overall_df['label_pr'] == i)])
        FP = len(overall_df[(overall_df['label_gt'] != i)
                            & (overall_df['label_pr'] == i)])
        FN = len(overall_df[(overall_df['label_gt'] == i)
                            & (overall_df['label_pr'] != i)])
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / \
            (precision + recall) if (precision + recall) > 0 else 0
        macro_F1 += weights[i] * F1
    return macro_F1
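# -- Editor's demo (not in the original file): with only classes 0 and 1
# present and predicted perfectly, the absent classes contribute F1 = 0, so
# the score is 5/11 + 4/11 = 9/11 rather than 1.0 -- the weighting makes
# class-0 mistakes by far the most expensive.
def _macro_f1_demo():
    target = pd.DataFrame({'sn': ['s1', 's2'],
                           'fault_time': ['t1', 't2'],
                           'label': [0, 1]})
    assert abs(macro_f1(target, target.copy()) - 9 / 11) < 1e-9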
def search_weight(train, valid_y, raw_prob, init_weight=[
        1.0], class_num=4, step=0.001):
    # Greedy per-class weight search: repeatedly rescale one class's
    # probability column and keep any change that improves macro_f1, until a
    # full round brings no improvement.
    weight = init_weight.copy() * class_num
    oof = train[['sn', 'fault_time']]
    oof['label'] = raw_prob.argmax(axis=1)
    f_best = macro_f1(train[['sn', 'fault_time', 'label']], oof)
    print("Init Score:", f_best)

    # f_best = f1_score(y_true=valid_y, y_pred=raw_prob.argmax(axis=1), average='macro')
    flag_score = 0
    round_num = 1
    while (flag_score != f_best):
        print("round: ", round_num)
        round_num += 1
        flag_score = f_best
        for c in range(class_num):
            for n_w in range(0, 2000, 10):
                num = n_w * step
                new_weight = weight.copy()
                new_weight[c] = num
                prob_df = raw_prob.copy()
                prob_df = prob_df * np.array(new_weight)

                oof['label'] = prob_df.argmax(axis=1)
                f = macro_f1(train[['sn', 'fault_time', 'label']], oof)
                # f = f1_score(y_true=valid_y, y_pred=prob_df.argmax(axis=1), average='macro')
                if f > f_best:
                    weight = new_weight.copy()
                    f_best = f
                    print(f"class:{c}, new_weight:{num}, f1 score: {f}")
    print(
        f'********************** SEARCH BEST WEIGHT : {weight} **********************')
    return weight


def get_new_cols(df, key=['sn', 'fault_time']):
    if isinstance(df.columns[0], tuple):

        new_cols = []
        for i in df.columns:
            if i[0] in key:
                new_cols.append(i[0])
            else:
                new_cols.append(f'{i[0]}_{i[1]}')
        df.columns = new_cols
        return df
    else:
        print('The DataFrame has no two-level column index; please check.')
        return df


if __name__ == '__main__':
    # create_all_dir()
    logger = Logger(name=os.path.basename(__file__).split(
        '.py')[0], log_path=LOG_DIR, mode="w").get_log
    print(len(KEY_WORDS))

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/lgb_fs.py:
--------------------------------------------------------------------------------
import os
import warnings

import numpy as np
import pandas as pd
import datetime
from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
    get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
    get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
    get_w2v_feats
from model import run_cbt, run_lgb
from utils import RESULT_DIR, TRAIN_DIR, \
    TEST_A_DIR, KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL, PSEUDO_FALG, GENERATION_DIR

warnings.filterwarnings('ignore')

def get_label(PSEUDO_FALG):
    preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
    preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)

    if PSEUDO_FALG:
        print('loading pseudo labels')
        pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
        label = pd.concat([preliminary_train_label_dataset,
                           pseudo_labels,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    else:
        print('not using pseudo-label data')
        label = pd.concat([preliminary_train_label_dataset,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
    return label


def get_log_dateset(PSEUDO_FALG):
    preliminary_sel_log_dataset = pd.read_csv(preliminary_sel_log_dataset_path)
    preliminary_sel_log_dataset_a = pd.read_csv(preliminary_sel_log_dataset_a_path)
    if PSEUDO_FALG:
        print('loading pseudo-label log data')
        pseudo_sel_log_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_sel_log_dataset.csv'))
        log_dataset = pd.concat([preliminary_sel_log_dataset,
                                 pseudo_sel_log_dataset,
                                 preliminary_sel_log_dataset_a],
                                ignore_index=True,
                                axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    else:
        print('not using pseudo-label data')
        log_dataset = pd.concat([preliminary_sel_log_dataset,
                                 preliminary_sel_log_dataset_a],
                                ignore_index=True,
                                axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    log_dataset['time'] = log_dataset['time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

    return log_dataset


def get_fea_distribute(feature_df, feature_importances, dataset_type, top=30):
    print('use the top feature importances to compare train/test value distributions')
    fea_distribute_list = []
    for i in feature_importances[:top]['fea'].to_list():
        fea_distribute_tmp = (feature_df[i].value_counts() / len(feature_df)).reset_index().rename(
            columns={'index': 'value'})
        fea_distribute_list.append(fea_distribute_tmp)

    fea_distribute = fea_distribute_list[-1]
    for i in fea_distribute_list[:-1]:
        fea_distribute = fea_distribute.merge(i, on='value', how='left')
    fea_distribute['value'] = fea_distribute['value'].apply(lambda x: f'{dataset_type}_{int(x)}')
    return fea_distribute


def get_train_test(label, preliminary_submit_dataset_a, log_dataset):
    print('building the train and test sets')
    train = label.merge(log_dataset, on='sn', how='left')
    test = preliminary_submit_dataset_a.merge(log_dataset, on='sn', how='left')
    # train['time_interval'] = (pd.to_datetime(train['fault_time']) - train['time']).apply(lambda x: x.total_seconds())
    # test['time_interval'] = (pd.to_datetime(test['fault_time']) - test['time']).apply(lambda x: x.total_seconds())
    # train = train.query('time_interval > 0')
    # test = test.query('time_interval > 0')
    print(f'train shape: {train.shape}, test shape: {test.shape}')
    train = train.drop_duplicates().reset_index(drop=True)
    test = test.drop_duplicates().reset_index(drop=True)
    train['time'] = pd.to_datetime(train['time'])
    test['time'] = pd.to_datetime(test['time'])
    return train, test
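# -- Editor's sketch: generate_feature.py is not included in this dump, so
# the exact get_feature() implementation is unknown. The time_list /
# KEY_WORDS plumbing below implies look-back windows before fault_time with
# per-keyword counts; a minimal version of that idea (an assumption, names
# hypothetical):
def _windowed_keyword_counts(event_df, windows_sec, keywords):
    # event_df: log rows for one (sn, fault_time), with 'time_interval' in
    # seconds before the fault and the raw 'msg' text
    fea = {}
    for w in windows_sec:
        in_win = event_df[(event_df['time_interval'] >= 0) &
                          (event_df['time_interval'] <= w)]
        for kw in keywords:
            fea[f'cnt|{kw}|{w}s'] = int(in_win['msg'].str.contains(kw, regex=False).sum())
    return fea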

start_time = datetime.datetime.now()

additional_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'additional_sel_log_dataset.csv')
preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
preliminary_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_sel_log_dataset.csv')

preliminary_submit_dataset_a_path = os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv')
preliminary_sel_log_dataset_a_path = os.path.join(TEST_A_DIR, 'final_sel_log_dataset_b.csv')

print(preliminary_submit_dataset_a_path, preliminary_sel_log_dataset_a_path)

preliminary_submit_dataset_a = pd.read_csv(preliminary_submit_dataset_a_path)
preliminary_submit_dataset_a.head()

log_dataset = get_log_dateset(PSEUDO_FALG)
label = get_label(PSEUDO_FALG)


next_time_list = [i / TIME_INTERVAL for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600]] + [1000000]

label, preliminary_submit_dataset_a = add_last_next_time4fault(label, preliminary_submit_dataset_a, TIME_INTERVAL,
                                                               next_time_list)
train, test = get_train_test(label, preliminary_submit_dataset_a, log_dataset)
train = train.drop_duplicates(['sn', 'fault_time', 'time', 'msg', 'server_model']).reset_index(drop=True)

train['time_interval'] = (pd.to_datetime(train['fault_time']) - pd.to_datetime(train['time'])).apply(
    lambda x: x.total_seconds())
test['time_interval'] = (pd.to_datetime(test['fault_time']) - pd.to_datetime(test['time'])).apply(
    lambda x: x.total_seconds())

all_data = pd.concat([train, test], axis=0, ignore_index=True)
all_data = all_data.sort_values(['sn', 'server_model', 'fault_time', 'time'])
w2v_feats = get_w2v_feats(all_data,
                          f1_list=['sn'],
                          f2_list=['msg_list', 'msg_0', 'msg_1', 'msg_2'])

# build server_model_time_interval_stat_fea
server_model_time_interval_stat_fea = get_server_model_time_interval_stat_fea(all_data)

msg_text_fea = get_msg_text_fea_all(all_data)
# time-delta features
duration_minutes_fea = get_duration_minutes_fea(train, test)

# server_model features
server_model_fea = get_server_model_fea(train, test)
counter = get_word_counter(train)

# nearest_msg features
nearest_msg_fea = get_nearest_msg_fea(train, test)
# server_model beta_target features
beta_target_fea = get_beta_target(train, test)

key = ['sn', 'fault_time', 'label', 'server_model']

fea_num = len(KEY_WORDS)
time_list = [i * TIME_INTERVAL for i in next_time_list]
train = get_feature(train, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'label', 'server_model'])
test = get_feature(test, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'server_model'])

print('adding time-delta features')
train = train.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
test = test.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])

print('adding server_model features')
train = train.merge(server_model_fea, on=['sn', 'server_model'])
test = test.merge(server_model_fea, on=['sn', 'server_model'])

print('adding w2v_feats')
train = train.merge(w2v_feats, on=['sn'])
test = test.merge(w2v_feats, on=['sn'])

print('adding nearest_msg features')
train = train.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])

print('adding beta_target features')
train = train.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])

server_model_sn_fea_2 = get_server_model_sn_fea_2(train, test)
print('adding server_model_sn_fea_2 features')
train = train.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
test = test.merge(server_model_sn_fea_2, on=['sn', 'server_model'])

# crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea.csv'))
# print('adding crashdump_venus_fea features')
# print(train.shape, test.shape, crashdump_venus_fea.shape)
# train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# print(train.shape, test.shape)

crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea_v1.csv'))
print('adding crashdump_venus_fea features')
print(train.shape, test.shape, crashdump_venus_fea.shape)
train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
print(train.shape, test.shape)
test.to_csv(os.path.join(GENERATION_DIR, 'test.csv'), index=False)
train.to_csv(os.path.join(GENERATION_DIR, 'train.csv'), index=False)

# print('adding msg_text_fea features')
# train = train.merge(msg_text_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(msg_text_fea, on=['sn', 'fault_time'], how='left')

# print('adding keyword cross features')
# train, test = get_key_word_cross_fea(train, test)

# print('adding server_model_time_interval_stat_fea features')
# train = train.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')
# test = test.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')


use_less_cols_1 = ['last_last_msg_cnt', 'last_first_msg_cnt', 'time_diff_1_min',
                   'last_msg_list_unique_LabelEnc', 'last_msg_0_unique_LabelEnc',
                   'last_msg_1_unique_LabelEnc', 'last_msg_2_unique_LabelEnc',
                   'last_msg_list_list_LabelEnc', 'last_msg_0_list_LabelEnc',
                   'last_msg_1_list_LabelEnc', 'last_msg_2_list_LabelEnc',
                   'last_msg_0_first_LabelEnc', 'last_msg_1_first_LabelEnc',
                   'last_msg_2_first_LabelEnc', 'last_msg_0_last_LabelEnc',
                   'last_msg_1_last_LabelEnc', 'last_msg_2_last_LabelEnc',
                   'last_msg_last_LabelEnc', 'last_msg_first_LabelEnc']

use_less_col = [i for i in train.columns if train[i].nunique() < 2] + use_less_cols_1


print(f'use_less_col:{len(use_less_col)}')
use_cols = [i for i in train.columns if i not in ['sn', 'fault_time', 'label', 'server_model'] + use_less_col]
cat_cols = ['server_model_LabelEnc', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc', ]
use_cols = sorted(use_cols)
print('feature count used:', len(use_cols))

# cat_cols = []
# for i in use_cols:
#     if '_LabelEnc' in i:
#         cat_cols.append(i)
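# -- Editor's sketch: the README advertises adversarial validation and
# get_fea_distribute() above compares train/test value distributions, but no
# adversarial-validation code appears in this dump. A minimal version (an
# assumption): label train rows 0 and test rows 1, fit a classifier, and read
# the cross-validated AUC -- values near 0.5 mean the two sets are hard to
# tell apart, i.e. similarly distributed.
def _adversarial_validation_auc(train_fea, test_fea, cols, seed=42):
    from sklearn.ensemble import RandomForestClassifier  # stand-in learner
    from sklearn.model_selection import cross_val_score
    X = pd.concat([train_fea[cols], test_fea[cols]], ignore_index=True).fillna(-999)
    y = np.array([0] * len(train_fea) + [1] * len(test_fea))
    clf = RandomForestClassifier(n_estimators=200, random_state=seed, n_jobs=-1)
    return cross_val_score(clf, X, y, cv=3, scoring='roc_auc').mean()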

oof_prob = np.zeros((train.shape[0], 4))
test_prob = np.zeros((test.shape[0], 4))
# seeds = [42, 4242, 40424, 1024, 2048]
seeds = [42]
for seed in seeds:
    # average across seeds into separate accumulators so the OOF/test arrays
    # are not overwritten mid-loop
    oof_prob_seed, test_prob_seed, fea_imp_df, model_list = run_lgb(train[use_cols], train[['label']], test[use_cols], k=5,
                                                                    seed=seed, cat_cols=cat_cols)
    oof_prob += oof_prob_seed / len(seeds)
    test_prob += test_prob_seed / len(seeds)

weight = search_weight(train, train[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
oof_prob = oof_prob * np.array(weight)
test_prob = test_prob * np.array(weight)

target_df = train[['sn', 'fault_time', 'label']]
submit_df = train[['sn', 'fault_time']]
submit_df['label'] = oof_prob.argmax(axis=1)

score = macro_f1(target_df=target_df, submit_df=submit_df)
print(f'********************** BEST MACRO_F1 : {score} **********************')
score = round(score, 5)

y_pred = test_prob.argmax(axis=1)
result = test[['sn', 'fault_time']]
result['label'] = y_pred
result = preliminary_submit_dataset_a.merge(result, on=['sn', 'fault_time'], how='left')[['sn', 'fault_time', 'label']]
result['label'] = result['label'].fillna(0).astype(int)

result.to_csv(os.path.join(RESULT_DIR, f'lgb_result.csv'), index=False)

fea_imp_df = fea_imp_df.reset_index(drop=True)
fea_imp_df.to_csv(os.path.join(RESULT_DIR, f'./lgb_fea_imp_{int(score * 100000)}.csv'), index=False)

train_result_prob = pd.DataFrame(oof_prob).add_prefix('lgb_class_')
test_result_prob = pd.DataFrame(test_prob).add_prefix('lgb_class_')
train_result_prob['label'] = train['label']
train_result_prob['sn'] = train['sn']
train_result_prob['fault_time'] = train['fault_time']
test_result_prob['sn'] = test['sn']
test_result_prob['fault_time'] = test['fault_time']

result_prob = pd.concat([train_result_prob, test_result_prob], ignore_index=True)
result_prob.to_csv(os.path.join(RESULT_DIR, f'lgb_prob_result.csv'), index=False)


end_time = datetime.datetime.now()
cost_time = end_time - start_time
print('****************** LIGHTGBM COST TIME : ', str(cost_time), ' ******************')

'''

v7   best version so far, offline 7356
v8:  v7 + keyword cross features, offline 0.7357, online 7338
v8.1 v7 + keyword cross features fed to the model as categorical variables, 0.73361
v8.2 v7 + keyword cross features as categoricals, TOP_KEY_WORDS removed, 7117
v8.3 v7 + keyword cross features as categoricals, using TOP_KEY_WORDS_2, 7260
v8.3 v7 + keyword cross features as categoricals, adding TOP_KEY_WORDS_2, 7260

'''

--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/catboost_fs.py:
--------------------------------------------------------------------------------
import datetime
import os
import warnings

import numpy as np
import pandas as pd


from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
    get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
    get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
    get_w2v_feats, get_key_for_top_fea, get_time_diff_feats_v2
from model import run_cbt
from utils import RESULT_DIR, TRAIN_DIR, \
    TEST_A_DIR, KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL, PSEUDO_FALG, GENERATION_DIR

warnings.filterwarnings('ignore')
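# -- Editor's demo: search_weight() returns per-class probability multipliers
# (the best run recorded in log/catboost.log found [1.45, 1.0, 0.55, 0.83]);
# applying them is a plain element-wise rescale before the argmax (helper
# name is hypothetical):
def _apply_class_weights(prob, weight):
    return (prob * np.array(weight)).argmax(axis=1)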
def get_label(PSEUDO_FALG):
    preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
    preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)

    if PSEUDO_FALG:
        print('Loading pseudo-label targets')
        pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
        label = pd.concat([preliminary_train_label_dataset,
                           pseudo_labels,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    else:
        print('Not using pseudo-label data')
        label = pd.concat([preliminary_train_label_dataset,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    # Parse and re-serialise fault_time to normalise the timestamp format.
    label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
    return label


def get_log_dataset(PSEUDO_FALG):
    preliminary_sel_log_dataset = pd.read_csv(preliminary_sel_log_dataset_path)
    preliminary_sel_log_dataset_a = pd.read_csv(preliminary_sel_log_dataset_a_path)
    if PSEUDO_FALG:
        print('Loading pseudo-label log data')
        pseudo_sel_log_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_sel_log_dataset.csv'))
        log_dataset = pd.concat([preliminary_sel_log_dataset,
                                 pseudo_sel_log_dataset,
                                 preliminary_sel_log_dataset_a],
                                ignore_index=True,
                                axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    else:
        print('Not using pseudo-label data')
        log_dataset = pd.concat([preliminary_sel_log_dataset,
                                 preliminary_sel_log_dataset_a],
                                ignore_index=True,
                                axis=0).sort_values(
            ['sn', 'time', 'server_model']).reset_index(drop=True)
    log_dataset['time'] = log_dataset['time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

    return log_dataset


def get_fea_distribute(feature_df, feature_importances, dataset_type, top=30):
    print('Profile the value distribution of the top-importance features, '
          'to check that train and test are distributed consistently')
    fea_distribute_list = []
    for i in feature_importances[:top]['fea'].to_list():
        fea_distribute_tmp = (feature_df[i].value_counts() / len(feature_df)).reset_index().rename(
            columns={'index': 'value'})
        fea_distribute_list.append(fea_distribute_tmp)

    fea_distribute = fea_distribute_list[-1]
    for i in fea_distribute_list[:-1]:
        fea_distribute = fea_distribute.merge(i, on='value', how='left')
    fea_distribute['value'] = fea_distribute['value'].apply(lambda x: f'{dataset_type}_{int(x)}')
    return fea_distribute


def get_train_test(label, preliminary_submit_dataset_a, log_dataset):
    print('Building the train and test sets')
    train = label.merge(log_dataset, on='sn', how='left')
    test = preliminary_submit_dataset_a.merge(log_dataset, on='sn', how='left')
    # train['time_interval'] = (pd.to_datetime(train['fault_time']) - train['time']).apply(lambda x: x.total_seconds())
    # test['time_interval'] = (pd.to_datetime(test['fault_time']) - test['time']).apply(lambda x: x.total_seconds())
    # train = train.query('time_interval > 0')
    # test = test.query('time_interval > 0')
    print(f'train shape: {train.shape}, test shape: {test.shape}')
    train = train.drop_duplicates().reset_index(drop=True)
    test = test.drop_duplicates().reset_index(drop=True)
    train['time'] = pd.to_datetime(train['time'])
    test['time'] = pd.to_datetime(test['time'])
    return train, test

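# A minimal usage sketch for get_fea_distribute (not executed anywhere in this
# script). It assumes the importance frame has a 'fea' column, which is how the
# function indexes it; comparing the two outputs side by side is a lightweight,
# adversarial-validation-style consistency check between train and test.
#
#   train_dist = get_fea_distribute(train[use_cols], fea_imp_df, 'train', top=30)
#   test_dist = get_fea_distribute(test[use_cols], fea_imp_df, 'test', top=30)
#   print(pd.concat([train_dist, test_dist], ignore_index=True))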
start_time = datetime.datetime.now()

# NOTE: the *_a variable names are kept from the preliminary round, but they
# now point at the final round-B files.
additional_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'additional_sel_log_dataset.csv')
preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
preliminary_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_sel_log_dataset.csv')

preliminary_submit_dataset_a_path = os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv')
preliminary_sel_log_dataset_a_path = os.path.join(TEST_A_DIR, 'final_sel_log_dataset_b.csv')

print(preliminary_submit_dataset_a_path, preliminary_sel_log_dataset_a_path)

preliminary_submit_dataset_a = pd.read_csv(preliminary_submit_dataset_a_path)
preliminary_submit_dataset_a.head()

log_dataset = get_log_dataset(PSEUDO_FALG)
label = get_label(PSEUDO_FALG)


next_time_list = [i / TIME_INTERVAL for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600]] + [1000000]

label, preliminary_submit_dataset_a = add_last_next_time4fault(label, preliminary_submit_dataset_a, TIME_INTERVAL,
                                                               next_time_list)
train, test = get_train_test(label, preliminary_submit_dataset_a, log_dataset)
train = train.drop_duplicates(['sn', 'fault_time', 'time', 'msg', 'server_model']).reset_index(drop=True)

# Seconds between the labelled fault time and each log line.
train['time_interval'] = (pd.to_datetime(train['fault_time']) - pd.to_datetime(train['time'])).apply(
    lambda x: x.total_seconds())
test['time_interval'] = (pd.to_datetime(test['fault_time']) - pd.to_datetime(test['time'])).apply(
    lambda x: x.total_seconds())

all_data = pd.concat([train, test], axis=0, ignore_index=True)
all_data = all_data.sort_values(['sn', 'server_model', 'fault_time', 'time'])
w2v_feats = get_w2v_feats(all_data,
                          f1_list=['sn'],
                          f2_list=['msg_list', 'msg_0', 'msg_1', 'msg_2'])
# time_diff_feats_v2 features
time_diff_feats_v2 = get_time_diff_feats_v2(all_data)
# server_model_time_interval_stat_fea features
server_model_time_interval_stat_fea = get_server_model_time_interval_stat_fea(all_data)

msg_text_fea = get_msg_text_fea_all(all_data)
# duration (time-difference) features
duration_minutes_fea = get_duration_minutes_fea(train, test)

# server_model features
server_model_fea = get_server_model_fea(train, test)
counter = get_word_counter(train)

# nearest_msg features
nearest_msg_fea = get_nearest_msg_fea(train, test)
# server_model beta_target features
beta_target_fea = get_beta_target(train, test)

key = ['sn', 'fault_time', 'label', 'server_model']

fea_num = len(KEY_WORDS)
time_list = [i * TIME_INTERVAL for i in next_time_list]
train = get_feature(train, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'label', 'server_model'])
test = get_feature(test, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'server_model'])

print('Adding duration features')
train = train.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
test = test.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])

print('Adding server_model features')
train = train.merge(server_model_fea, on=['sn', 'server_model'])
test = test.merge(server_model_fea, on=['sn', 'server_model'])

print('Adding w2v_feats')
train = train.merge(w2v_feats, on=['sn'])
test = test.merge(w2v_feats, on=['sn'])

print('Adding nearest_msg features')
train = train.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])

print('Adding beta_target features')
train = train.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
test = test.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])

server_model_sn_fea_2 = get_server_model_sn_fea_2(train, test)
print('Adding server_model_sn_fea_2 features')
train = train.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
test = test.merge(server_model_sn_fea_2, on=['sn', 'server_model'])

print('Adding time_diff_feats_v2 features')
train = train.merge(time_diff_feats_v2, on=['sn', 'server_model', 'fault_time'])
test = test.merge(time_diff_feats_v2, on=['sn', 'server_model', 'fault_time'])

# The blocks below are experiments that were tried and then disabled; they are
# kept, commented out, as a record of what did not make the final model.

# test.to_csv(os.path.join(GENERATION_DIR, 'test.csv'), index=False)
# train.to_csv(os.path.join(GENERATION_DIR, 'train.csv'), index=False)

# crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea.csv'))
# print('Adding crashdump_venus_fea features')
# print(train.shape, test.shape, crashdump_venus_fea.shape)
# train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# print(train.shape, test.shape)

# crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea_v1.csv'))
# print('Adding crashdump_venus_fea features')
# print(train.shape, test.shape, crashdump_venus_fea.shape)
# train = train.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time'], how='left')
# print(train.shape, test.shape)
# test.to_csv(os.path.join(GENERATION_DIR, 'test.csv'), index=False)
# train.to_csv(os.path.join(GENERATION_DIR, 'train.csv'), index=False)

# print('Adding key_for_top_fea features')
# train, test = get_key_for_top_fea(train, test)

# print('Adding w2v_tfidf_doc2v_fea features')
# w2v_tfidf_fea = pd.read_csv(os.path.join(GENERATION_DIR, 'w2v_tfidf_fea.csv'))
# drop_cols = [i for i in w2v_tfidf_fea if 'doc2vec' in i] + [i for i in w2v_tfidf_fea if 'tfidf' in i]
# for col in drop_cols:
#     del w2v_tfidf_fea[col]
#
# train = train.merge(w2v_tfidf_fea, on=['sn'], how='left')
# test = test.merge(w2v_tfidf_fea, on=['sn'], how='left')

# print('Adding keyword cross features')
# train, test = get_key_word_cross_fea(train, test)

# print('Adding server_model_time_interval_stat_fea features')
# train = train.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')
# test = test.merge(server_model_time_interval_stat_fea, on=['server_model'], how='left')

use_less_cols_1 = ['last_last_msg_cnt', 'last_first_msg_cnt', 'time_diff_1_min',
                   'last_msg_list_unique_LabelEnc', 'last_msg_0_unique_LabelEnc',
                   'last_msg_1_unique_LabelEnc', 'last_msg_2_unique_LabelEnc',
                   'last_msg_list_list_LabelEnc', 'last_msg_0_list_LabelEnc',
                   'last_msg_1_list_LabelEnc', 'last_msg_2_list_LabelEnc',
                   'last_msg_0_first_LabelEnc', 'last_msg_1_first_LabelEnc',
                   'last_msg_2_first_LabelEnc', 'last_msg_0_last_LabelEnc',
                   'last_msg_1_last_LabelEnc', 'last_msg_2_last_LabelEnc',
                   'last_msg_last_LabelEnc', 'last_msg_first_LabelEnc']

# Drop constant columns plus the hand-picked useless ones above.
use_less_col = [i for i in train.columns if train[i].nunique() < 2] + use_less_cols_1


print(f'use_less_col:{len(use_less_col)}')
use_cols = [i for i in train.columns if i not in ['sn', 'fault_time', 'label', 'server_model'] + use_less_col]

use_cols = sorted(use_cols)

# Every *_LabelEnc column among the used features is treated as categorical.
cat_cols = []
for i in use_cols:
    if '_LabelEnc' in i:
        cat_cols.append(i)
print('feature count:', len(use_cols), 'categorical feature count:', len(cat_cols))
# Disabled feature-selection experiment, kept for reference:
# fs = FeatureSelector(data=train[use_cols], labels=train['label'])
#
# # flag features with more than 90% missing values
# fs.identify_missing(missing_threshold=0.9)
#
# # # inspect the flagged features
# # fs.ops['missing']
# # without one-hot encoding (default False), flag feature pairs with correlation above 0.99
# fs.identify_collinear(correlation_threshold=0.99, one_hot=False)
#
# # # inspect the flagged features
# # fs.ops['collinear']
#
# # flag single-valued features
# fs.identify_single_unique()
#
# # # inspect the flagged features
# # fs.ops['single_unique']
#
# train_removed = fs.remove(methods=['missing', 'single_unique', 'collinear'], keep_one_hot=False)
# use_cols = train_removed.columns
# print('feature count after selection:', len(use_cols))


oof_prob = np.zeros((train.shape[0], 4))
test_prob = np.zeros((test.shape[0], 4))
# seeds = [42, 4242, 40424, 1024, 2048]
seeds = [42]
for seed in seeds:
    # Same seed-averaging pattern as in lgb_fs.py: keep the per-seed outputs
    # separate from the accumulators.
    oof_prob_seed, test_prob_seed, fea_imp_df, model_list = run_cbt(train[use_cols], train[['label']], test[use_cols],
                                                                    k=5, seed=seed, cat_cols=cat_cols)
    oof_prob += oof_prob_seed / len(seeds)
    test_prob += test_prob_seed / len(seeds)


weight = search_weight(train, train[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
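# utils.search_weight is not shown in this file; judging from the call above it
# returns one multiplicative weight per class, tuned on the OOF probabilities
# to maximise macro F1. The function below is only a rough sketch of that idea
# (coordinate-wise grid search), written under that assumption; the real
# implementation in utils.py may differ.
def _search_weight_sketch(y_true, prob, class_num=4):
    from sklearn.metrics import f1_score

    def score(w):
        # Scale each class column, re-take the argmax, and score macro F1.
        return f1_score(y_true, (prob * w).argmax(axis=1), average='macro')

    w = np.ones(class_num)
    for _ in range(2):  # a couple of coordinate passes
        for c in range(class_num):
            candidates = np.arange(0.5, 2.0, 0.05)  # coarse grid; the real step is 0.001
            scores = []
            for cand in candidates:
                w[c] = cand
                scores.append(score(w))
            w[c] = candidates[int(np.argmax(scores))]
    return w
# Hypothetical usage: w = _search_weight_sketch(train['label'].values, oof_prob)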
oof_prob = oof_prob * np.array(weight)
test_prob = test_prob * np.array(weight)

target_df = train[['sn', 'fault_time', 'label']]
submit_df = train[['sn', 'fault_time']]
submit_df['label'] = oof_prob.argmax(axis=1)

score = macro_f1(target_df=target_df, submit_df=submit_df)
print(f'********************** BEST MACRO_F1 : {score} **********************')
score = round(score, 5)

y_pred = test_prob.argmax(axis=1)
result = test[['sn', 'fault_time']]
result['label'] = y_pred
result = preliminary_submit_dataset_a.merge(result, on=['sn', 'fault_time'], how='left')[['sn', 'fault_time', 'label']]
result['label'] = result['label'].fillna(0).astype(int)

result.to_csv(os.path.join(RESULT_DIR, 'catboost_result.csv'), index=False)
print(result['label'].value_counts())
fea_imp_df = fea_imp_df.reset_index(drop=True)
fea_imp_df.to_csv(os.path.join(RESULT_DIR, f'cat_fea_imp_{int(score * 100000)}.csv'), index=False)

train_result_prob = pd.DataFrame(oof_prob).add_prefix('cat_class_')
test_result_prob = pd.DataFrame(test_prob).add_prefix('cat_class_')
train_result_prob['label'] = train['label']
train_result_prob['sn'] = train['sn']
train_result_prob['fault_time'] = train['fault_time']
test_result_prob['sn'] = test['sn']
test_result_prob['fault_time'] = test['fault_time']

result_prob = pd.concat([train_result_prob, test_result_prob], ignore_index=True)
result_prob.to_csv(os.path.join(RESULT_DIR, 'cat_prob_result.csv'), index=False)

end_time = datetime.datetime.now()
cost_time = end_time - start_time
print('****************** CATBOOST COST TIME : ', str(cost_time), ' ******************')

'''
Experiment log (scores are offline macro F1):

v7: best version so far                                        0.7303
v8: v7 + keyword cross features as categorical model inputs    0.73114

'''
-------------------------------------------------------------------------------- /3rd_PanJiu_AIOps_Competition/code/get_crashdump_venus_fea.py: --------------------------------------------------------------------------------
import datetime
import os
import gc
import warnings
import pickle

import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from generate_feature import add_w2v_feats, cat2num
from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
    get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
    get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
    get_w2v_feats, get_key, get_class_key_words_nunique
from model import run_cbt, run_lgb
from utils import RESULT_DIR, TRAIN_DIR, \
    TEST_A_DIR, KEY_WORDS, TOP_KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL, PSEUDO_FALG, \
    GENERATION_DIR

warnings.filterwarnings('ignore')


def get_fault_code_list(x):
    # fault_code is a '.'/','-separated string; NaN becomes an empty list.
    try:
        x = x.replace('.', ',').split(',')
    except AttributeError:
        x = []
    return x


def get_module_cause_list(x):
    # module_cause is comma-separated; NaN becomes an empty list.
    try:
        x = x.replace(',', '_').replace(',', '_')
        x = list(set(x.split('_')))
    except AttributeError:
        x = []
    return x


def get_label(PSEUDO_FALG):
    preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
    preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)

    if PSEUDO_FALG:
        print('Loading pseudo-label targets')
        pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
        label = pd.concat([preliminary_train_label_dataset,
                           pseudo_labels,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    else:
        print('Not using pseudo-label data')
        label = pd.concat([preliminary_train_label_dataset,
                           preliminary_train_label_dataset_s],
                          ignore_index=True,
                          axis=0).sort_values(
            ['sn', 'fault_time']).reset_index(drop=True)
    label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
    return label


def get_module_cause_code(x, code_name):
    # Collect the tokens in x that contain the given code marker.
    code_list = []
    for i in x:
        if code_name in i:
            code_list.append(i)
    return code_list


def get_alertname_code(x, alertname):
    # After a comma-split, the token following `alertname` is its code.
    x = x.split(',')

    try:
        alertname_code = x[x.index(alertname) + 1]
    except (ValueError, IndexError):
        alertname_code = np.nan
    return alertname_code
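# A quick illustration of get_alertname_code; the sample value is hypothetical,
# the real field layout is only inferred from the parsing code in this file:
#
#   get_alertname_code('module0,cod1_0x1,module1,cod2_0x2', 'module1')
#   # -> 'cod2_0x2'  (the token right after 'module1')
#   get_alertname_code('module0,cod1_0x1', 'module7')
#   # -> nan         (module name absent)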
def get_alertname_code_2(x, alertname):
    # x = x.split(',')  # unlike get_alertname_code, x is already a token list here

    try:
        alertname_code = x[x.index(alertname) + 1]
    except (ValueError, IndexError):
        alertname_code = ' '
    return alertname_code


def get_last_msg_cnt(x):
    last_msg = x[-1]
    cnt = x.count(last_msg)
    return cnt


def get_first_msg_cnt(x):
    first_msg = x[0]
    cnt = x.count(first_msg)
    return cnt


def get_crashdump_venus_data():
    # Outer-join the crashdump and venus tables for both the final B round and
    # the preliminary round, then stack and deduplicate them.
    final_venus_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_venus_dataset_b.csv'))
    final_crashdump_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_crashdump_dataset_b.csv'))
    final_crashdump_venus = final_crashdump_dataset.merge(final_venus_dataset, on=['sn', 'fault_time'],
                                                          how='outer')

    preliminary_venus_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_venus_dataset.csv'))
    preliminary_crashdump_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_crashdump_dataset.csv'))
    preliminary_crashdump_venus = preliminary_crashdump_dataset.merge(preliminary_venus_dataset,
                                                                      on=['sn', 'fault_time'],
                                                                      how='outer')

    crashdump_venus = pd.concat([final_crashdump_venus, preliminary_crashdump_venus],
                                ignore_index=True).drop_duplicates()
    crashdump_venus = crashdump_venus.sort_values(['sn', 'fault_time']).reset_index(drop=True)
    return crashdump_venus


def get_crashdump_venus_fea(crashdump_venus):
    print('Building crashdump_venus features')
    crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].apply(lambda x: get_module_cause_list(x))
    crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].apply(lambda x: get_fault_code_list(x))

    code_name_list = ['module', 'cod1', 'cod2', 'addr', 'port']
    for code_name in code_name_list:
        crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus['module_cause_list'].apply(
            lambda x: get_module_cause_code(x, code_name))
        crashdump_venus[f'module_cause_{code_name}_len'] = crashdump_venus[f'module_cause_{code_name}'].apply(
            lambda x: len(x))
        crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus[f'module_cause_{code_name}'].apply(
            lambda x: '_'.join(set(x)))
    code_name_list = ['cha', '0x', 'cod', 'core', 'cpu', 'm2m', 'pcu']
    for code_name in code_name_list:
        crashdump_venus[f'fault_{code_name}'] = crashdump_venus['fault_code_list'].apply(
            lambda x: get_module_cause_code(x, code_name))
        crashdump_venus[f'fault_{code_name}_len'] = crashdump_venus[f'fault_{code_name}'].apply(lambda x: len(x))
        crashdump_venus[f'fault_{code_name}'] = crashdump_venus[f'fault_{code_name}'].apply(lambda x: '_'.join(set(x)))

    cols_tmp = ['module_cause', 'fault_code', 'module_cause_module',
                'module_cause_cod1', 'module_cause_cod2', 'module_cause_addr',
                'module_cause_port', 'fault_cha', 'fault_0x', 'fault_cod', 'fault_core',
                'fault_cpu', 'fault_m2m', 'fault_pcu']
    new_cat_cols = []
    crashdump_venus = cat2num(crashdump_venus, cols_tmp)
    for name in cols_tmp:
        # le = LabelEncoder()
        # crashdump_venus[f'{name}_LabelEnc'] = le.fit_transform(crashdump_venus[name])
        new_cat_cols.append(f'{name}_LabelEnc')

    num_cols = ['fault_pcu_len', 'fault_m2m_len',
                'fault_cpu_len', 'fault_0x_len', 'fault_cod_len',
                'module_cause_module_len', 'module_cause_cod1_len',
                'module_cause_cod2_len', 'module_cause_addr_len',
                'module_cause_port_len', 'fault_cha_len', 'fault_core_len']

    crashdump_venus = crashdump_venus[['sn', 'fault_time'] + new_cat_cols + num_cols]
    # fault_time is dropped here, so only the engineered columns keyed by sn survive.
    crashdump_venus = crashdump_venus.rename(columns={'fault_time': 'crashdump_fault_time'})
    del crashdump_venus['crashdump_fault_time']
    print(f'crashdump_venus features done, shape {crashdump_venus.shape}')
    return crashdump_venus


def get_location_word(x, num):
    try:
        return x[num]
    except (IndexError, TypeError):
        return


module_list = ['module0', 'module1', 'module2', 'module3', 'module4', 'module5', 'module7', 'module8', 'module9',
               'module10', 'module11', 'module12', 'module13', 'module14', 'module17', 'module18', 'module19',
               'in traffic control',
               'irpp0', 'irpp1',
               'pcie rootport 0:0.0', 'pcie rootport a2:0.0', 'pcie rootport 2b:3.0',
               'port a', 'port c']
module_list2 = ['module0', 'module1', 'module2', 'module3', 'module4', 'module5', 'module7', 'module8', 'module9',
                'module10', 'module11', 'module12', 'module13', 'module14', 'module17', 'module18', 'module19']
other_module_list = ['in traffic control', 'irpp0', 'irpp1', 'pcie rootport 0:0.0',
                     'pcie rootport a2:0.0', 'pcie rootport 2b:3.0', 'port a', 'port c']
module_content_list = ['module0_cod1', 'module0_cod2', 'module0_addr',
                       'module1_cod1', 'module1_cod2', 'module1_addr', 'module2_cod1',
                       'module2_cod2', 'module2_addr', 'module3_cod1', 'module3_cod2',
                       'module3_addr', 'module4_cod1', 'module4_cod2', 'module4_addr',
                       'module5_cod1', 'module5_cod2', 'module5_addr', 'module7_cod1',
                       'module7_cod2', 'module7_addr', 'module8_cod1', 'module8_cod2',
                       'module8_addr', 'module9_cod1', 'module9_cod2', 'module9_addr',
                       'module10_cod1', 'module10_cod2', 'module10_addr', 'module11_cod1',
                       'module11_cod2', 'module11_addr', 'module12_cod1', 'module12_cod2',
                       'module12_addr', 'module13_cod1', 'module13_cod2', 'module13_addr',
                       'module14_cod1', 'module14_cod2', 'module14_addr', 'module17_cod1',
                       'module17_cod2', 'module17_addr', 'module18_cod1', 'module18_cod2',
                       'module18_addr', 'module19_cod1', 'module19_cod2', 'module19_addr']
fault_code_content_list = ['fault_code_cod1', 'fault_code_cod2',
                           'fault_code_cpu0', 'fault_code_cpu1']


crashdump_venus = get_crashdump_venus_data()
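# The block below re-tokenises module_cause so that a plain comma-split puts
# each module name right before its code tokens. Walk-through on a
# hypothetical value (the real field layout is only inferred from this code):
#
#   raw:                     'module0:cod1_0x1,module1:cod2_0x2'
#   ':'/',' -> '_':          'module0_cod1_0x1_module1_cod2_0x2'
#   'module0_' -> 'module0:',
#   '_module1' -> ',module1': 'module0:cod1_0x1,module1:cod2_0x2'
#   final ':' -> ',':        'module0,cod1_0x1,module1,cod2_0x2'
#
# so get_alertname_code(x, 'module1') then returns 'cod2_0x2'.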
crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].fillna('_').apply(lambda x: x.split(','))
crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(
    lambda x: x.replace(':', '_').replace(',', '_'))
for module in module_list:
    crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(
        lambda x: x.replace(f'{module}_', f'{module}:').replace(f'_{module}', f',{module}'))
crashdump_venus['module_cause'] = crashdump_venus['module_cause'].apply(lambda x: x.replace(':', ','))

for module in module_list:
    # For every module name, pull the code tokens that follow it.
    crashdump_venus[module] = crashdump_venus['module_cause'].apply(lambda x: get_alertname_code(x, module))
    crashdump_venus[module] = crashdump_venus.loc[:, module].fillna(' ').apply(lambda x: x.replace('_', ' '))
    crashdump_venus[module] = crashdump_venus[module].apply(lambda x: x.split(' '))
crashdump_venus['module_cause_new'] = crashdump_venus.loc[:, module_list].sum(1)


for module in module_list2:
    crashdump_venus[f'{module}_cod1'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'cod1')])
    crashdump_venus[f'{module}_cod2'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'cod2')])
    crashdump_venus[f'{module}_addr'] = crashdump_venus[module].apply(lambda x: [get_alertname_code_2(x, 'addr')])
    del crashdump_venus[module]
gc.collect()

crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].fillna(' ').apply(lambda x: x.split('.'))
for i in ['cod1', 'cod2', 'cpu0', 'cpu1']:
    crashdump_venus[f'fault_code_{i}'] = crashdump_venus['fault_code_list'].apply(
        lambda x: [get_alertname_code_2(x, i)])


# Concatenated token-list columns, used both for w2v embeddings and as categorical codes.
crashdump_venus['other_module_list'] = crashdump_venus.loc[:, other_module_list].sum(1)
crashdump_venus['module_content_list'] = crashdump_venus.loc[:, module_content_list].sum(1)
crashdump_venus['module_cause_new'] = crashdump_venus.loc[:, other_module_list + module_content_list].sum(1)
crashdump_venus['fault_code_content_list'] = crashdump_venus.loc[:, fault_code_content_list].sum(1)
crashdump_venus['all_crashdump_venus'] = crashdump_venus.loc[:, other_module_list + module_content_list + fault_code_content_list].sum(1)

f1_list = ['sn']
f2_list = ['other_module_list', 'module_content_list', 'module_cause_new', 'fault_code_content_list', 'all_crashdump_venus']
w2v_feats_df = crashdump_venus[f1_list].drop_duplicates()
w2v_feats_df_list = []
for f1 in f1_list:
    for f2 in f2_list:
        w2v_fea_tmp = add_w2v_feats(crashdump_venus, w2v_feats_df, f1, f2, emb_size=10, window=5, min_count=5)
        w2v_feats_df_list.append(w2v_fea_tmp)
w2v_feats_df = w2v_feats_df_list[0]
for i in w2v_feats_df_list[1:]:
    w2v_feats_df = w2v_feats_df.merge(i, on='sn', how='left')

for i in other_module_list + module_content_list + fault_code_content_list:
    crashdump_venus[i] = crashdump_venus[i].astype(str)

crashdump_venus = cat2num(crashdump_venus, other_module_list + module_content_list + fault_code_content_list)
for i in other_module_list + module_content_list + fault_code_content_list:
    del crashdump_venus[i]
gc.collect()
crashdump_venus = crashdump_venus.merge(w2v_feats_df, on='sn', how='left').rename(
    columns={'fault_time': 'crashdump_venus_fault_time'})

preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
test = pd.read_csv(os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv'))[['sn', 'fault_time']]
train = get_label(False)[['sn', 'fault_time', 'label']]

test_tmp = test[['sn', 'fault_time']]
test_tmp = test_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)
train_tmp = train[['sn', 'fault_time', 'label']]
train_tmp = train_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)


train_tmp['duration_fault_time'] = pd.to_datetime(train_tmp['fault_time']) - pd.to_datetime(train_tmp['crashdump_venus_fault_time'])
test_tmp['duration_fault_time'] = pd.to_datetime(test_tmp['fault_time']) - pd.to_datetime(test_tmp['crashdump_venus_fault_time'])

train_tmp['duration_fault_time'] = train_tmp['duration_fault_time'].apply(lambda x: x.total_seconds())
test_tmp['duration_fault_time'] = test_tmp['duration_fault_time'].apply(lambda x: x.total_seconds())


drop_cols = ['sn', 'fault_time', 'fault_code', 'module_cause', 'module', 'crashdump_venus_fault_time',
             'module_cause_list', 'module_cause_new', 'fault_code_list', 'label', 'duration_fault_time',
             'other_module_list', 'module_content_list', 'fault_code_content_list',
             'all_crashdump_venus']
use_cols = [i for i in train_tmp.columns if i not in drop_cols]

cat_cols = [f'{i}_LabelEnc' for i in other_module_list + module_content_list + fault_code_content_list]

# The OOF/test accumulators have to match train_tmp/test_tmp, the frames that
# are actually fed to run_cbt (train/test can have a different row count).
oof_prob = np.zeros((train_tmp.shape[0], 4))
test_prob = np.zeros((test_tmp.shape[0], 4))
# seeds = [42, 4242, 40424, 1024, 2048]
seeds = [42]
for seed in seeds:
    # Same seed-averaging pattern as in the main scripts.
    oof_prob_seed, test_prob_seed, fea_imp_df, model_list = run_cbt(train_tmp[use_cols], train_tmp[['label']],
                                                                    test_tmp[use_cols], k=5,
                                                                    seed=seed, cat_cols=cat_cols)
    oof_prob += oof_prob_seed / len(seeds)
    test_prob += test_prob_seed / len(seeds)


weight = search_weight(train_tmp, train_tmp[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
oof_prob = oof_prob * np.array(weight)
test_prob = test_prob * np.array(weight)


target_df = train_tmp[['sn', 'fault_time', 'label']].drop_duplicates(['sn', 'fault_time'])
submit_df = train_tmp[['sn', 'fault_time']]
submit_df['label'] = oof_prob.argmax(axis=1)
submit_df = submit_df.drop_duplicates(['sn', 'fault_time'])
# submit_df = pd.read_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea1.csv')).rename(columns={'crashdump_venus_label': 'label'})


score = macro_f1(target_df=target_df, submit_df=submit_df)
print(f'********************** BEST MACRO_F1 : {score} **********************')
score = round(score, 5)

print(fea_imp_df[:20])
y_pred = test_prob.argmax(axis=1)
result = test_tmp[['sn', 'fault_time']]
result['label'] = y_pred
result = result.drop_duplicates(['sn', 'fault_time'])

# The OOF labels (train) and predicted labels (test) are stacked and written
# out as a single feature file for the main models.
crashdump_venus_fea = pd.concat([submit_df, result], ignore_index=False, axis=0)
crashdump_venus_fea = crashdump_venus_fea.rename(columns={'label': 'crashdump_venus_label_v1'})
crashdump_venus_fea.to_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea_v1.csv'), index=False)
print(crashdump_venus_fea['crashdump_venus_label_v1'].value_counts())

-------------------------------------------------------------------------------- /LICENSE:
-------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". 
"Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. 
For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 
192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 
256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. 
But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 374 | those licensors and authors. 
375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. 
You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. 
"Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. 
This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 
611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 631 | 632 | <one line to give the program's name and a brief idea of what it does.> 633 | Copyright (C) <year> <name of author> 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published 637 | by the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see <https://www.gnu.org/licenses/>. 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | <https://www.gnu.org/licenses/>. 662 | --------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/generate_feature.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 | import pickle
4 | from collections import Counter
5 | from utils import get_new_cols
6 | import numpy as np
7 | import pandas as pd
8 | from tqdm import tqdm
9 | from gensim.models import Word2Vec
10 | from utils import GENERATION_DIR
11 | from utils import KEY_1, KEY_2, KEY_3, KEY_4
12 | from scipy import stats
13 | 
14 | 
15 | def cat2num(df, cat_cols, Transfer2num=True):
16 |     '''
17 |     Label-encode categorical features, persisting one mapping pickle per column.
18 |     :param df: input DataFrame
19 |     :param cat_cols: list of categorical feature columns
20 |     :param Transfer2num: if True, map categories to integer codes; otherwise cast the columns to the pandas 'category' dtype
21 |     :return: df with the encoded columns added
22 |     '''
23 |     if Transfer2num:
24 |         # Persisting the mapping keeps train and inference encodings identical.
25 |         print('Transfer category features to num features')
26 |         for col in cat_cols:
27 |             # First run: build and save the mapping; later runs reload it.
28 |             if not os.path.exists(os.path.join(GENERATION_DIR, f'{col}_map.pkl')):
29 |                 print(f'Transfer : {col}')
30 |                 tmp_map = dict(zip(df[col].unique(), range(df[col].nunique())))
31 |                 with open(os.path.join(GENERATION_DIR, f'{col}_map.pkl'), 'wb') as f:
32 |                     pickle.dump(tmp_map, f)
33 |             else:
34 |                 with open(os.path.join(GENERATION_DIR, f'{col}_map.pkl'), 'rb') as f:
35 |                     tmp_map = pickle.load(f)
36 |             df[f'{col}_LabelEnc'] = df[col].map(tmp_map).fillna(-1).astype(int)  # unseen categories -> -1
37 |     else:
38 |         print('Cast category features to the category dtype')
39 |         for col in cat_cols:
40 |             df[col] = df[col].astype('category')
41 |     print('Transfer of category features done...')
42 |     return df
43 | 
44 | def add_minutes(x, minutes=5):
45 |     dt = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
46 |     out_date = (dt + datetime.timedelta(minutes=minutes)
47 |                 ).strftime('%Y-%m-%d %H:%M:%S')
48 |     return out_date
49 | 
50 | 
51 | def time_process(df, time_cols, minutes_):
52 |     df[f'time_{minutes_}'] = df[time_cols].apply(
53 |         lambda x: add_minutes(str(x), minutes_))
54 |     return df
55 | 
56 | 
57 | def get_fea(x, fea):
58 |     if fea in x:
59 |         return 1
60 |     else:
61 |         return 0
62 | 
63 | 
64 | def get_last_msg_cnt(x):
65 |     last_msg = x[-1]
66 |     cnt = x.count(last_msg)
67 |     return cnt
68 | 
69 | 
70 | def get_first_msg_cnt(x):
71 |     first_msg = x[0]
72 |     cnt = x.count(first_msg)
73 |     return cnt
74 | 
75 | 
76 | def add_last_next_time4fault(label, preliminary_submit_dataset_a,
77 |                              time_interval, next_time_list):
78 |     print(f'Add time points before and after each fault at interval {time_interval}')
79 |     for i in tqdm([-i for i in next_time_list] + next_time_list):
80 |         label = time_process(label, 'fault_time', i * time_interval)
81 |         preliminary_submit_dataset_a = time_process(
82 |             preliminary_submit_dataset_a, 'fault_time', i * time_interval)
83 | 
84 |     return label, preliminary_submit_dataset_a
85 | 
86 | 
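# A minimal usage sketch (hypothetical data, not from the competition code),
# assuming a fresh GENERATION_DIR with no saved mapping yet: the first
# cat2num call builds and pickles one mapping per column, later calls reload
# it, and categories unseen when the mapping was built fall back to -1.
def _example_cat2num_roundtrip():
    df_train = pd.DataFrame({'server_model': ['SM1', 'SM2', 'SM1']})
    df_train = cat2num(df_train, ['server_model'])  # builds and saves the mapping
    assert df_train['server_model_LabelEnc'].tolist() == [0, 1, 0]
    df_test = pd.DataFrame({'server_model': ['SM2', 'SM9']})
    df_test = cat2num(df_test, ['server_model'])  # reloads the saved mapping
    assert df_test['server_model_LabelEnc'].tolist() == [1, -1]  # 'SM9' unseen
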
87 | def get_msg_text_fea(df, msg_type='last'):
88 |     print(f'Build msg text {msg_type} features')
89 | 
90 |     df_fea = df.groupby(['sn', 'fault_time']).agg(
91 |         {'msg_list': 'sum', 'msg_0': 'sum', 'msg_1': 'sum', 'msg_2': 'sum'}).reset_index()
92 |     df_fea['msg_list_unique'] = df_fea['msg_list'].apply(lambda x: str(set(x)))
93 |     df_fea['msg_0_unique'] = df_fea['msg_0'].apply(lambda x: str(set(x)))
94 |     df_fea['msg_1_unique'] = df_fea['msg_1'].apply(lambda x: str(set(x)))
95 |     df_fea['msg_2_unique'] = df_fea['msg_2'].apply(lambda x: str(set(x)))
96 | 
97 |     df_fea['msg_list_list'] = df_fea['msg_list'].apply(lambda x: str(x))
98 |     df_fea['msg_0_list'] = df_fea['msg_0'].apply(lambda x: str(x))
99 |     df_fea['msg_1_list'] = df_fea['msg_1'].apply(lambda x: str(x))
100 |     df_fea['msg_2_list'] = df_fea['msg_2'].apply(lambda x: str(x))
101 | 
102 |     df_fea['msg_0_first'] = df_fea['msg_0'].apply(lambda x: x[0])
103 |     df_fea['msg_1_first'] = df_fea['msg_1'].apply(lambda x: x[0])
104 |     df_fea['msg_2_first'] = df_fea['msg_2'].apply(lambda x: x[0])
105 | 
106 |     df_fea['msg_0_last'] = df_fea['msg_0'].apply(lambda x: x[-1])
107 |     df_fea['msg_1_last'] = df_fea['msg_1'].apply(lambda x: x[-1])
108 |     df_fea['msg_2_last'] = df_fea['msg_2'].apply(lambda x: x[-1])
109 | 
110 |     df_fea['msg_last'] = df.groupby(['sn', 'fault_time']).apply(
111 |         lambda x: x['msg'].to_list()[-1]).values
112 |     df_fea['msg_first'] = df.groupby(['sn', 'fault_time']).apply(
113 |         lambda x: x['msg'].to_list()[0]).values
114 | 
115 |     df_fea['last_msg_cnt'] = df_fea['msg_list'].apply(
116 |         lambda x: get_last_msg_cnt(x))
117 |     df_fea['first_msg_cnt'] = df_fea['msg_list'].apply(
118 |         lambda x: get_first_msg_cnt(x))
119 |     cat_cols = ['msg_list', 'msg_0', 'msg_1', 'msg_2',
120 |                 'msg_list_unique', 'msg_0_unique', 'msg_1_unique', 'msg_2_unique',
121 |                 'msg_list_list', 'msg_0_list', 'msg_1_list', 'msg_2_list',
122 |                 'msg_0_first', 'msg_1_first', 'msg_2_first', 'msg_0_last', 'msg_1_last',
123 |                 'msg_2_last', 'msg_last', 'msg_first']
124 |     num_cols = ['last_msg_cnt', 'first_msg_cnt']
125 |     id_cols = ['sn', 'fault_time']
126 | 
127 |     df_fea = df_fea.rename(
128 |         columns={
129 |             i: f'{msg_type}_{i}' for i in (cat_cols + num_cols)})
130 |     cat_cols = [f'{msg_type}_{i}' for i in cat_cols]
131 |     for cat_col in cat_cols:
132 |         df_fea[cat_col] = df_fea[cat_col].astype(str)
133 |     df_fea = cat2num(df_fea, cat_cols, Transfer2num=True)
134 |     for i in cat_cols:
135 |         del df_fea[i]
136 |     return df_fea
137 | 
138 | def add_w2v_feats(all_data, w2v_feats_df, f1, f2, emb_size=32, window=5, min_count=5):
139 |     print(f'Build {f1}_{f2}_w2v features')
140 | 
141 |     df_fea = all_data.groupby(f1).agg({f2: 'sum'}).reset_index()
142 |     df_emb = df_fea[[f1]]
143 |     sentences = df_fea[f2].to_list()
144 |     if not os.path.exists(os.path.join(GENERATION_DIR, f'{f1}_{f2}_w2v_model.pkl')):
145 |         print(f'{f1}_{f2}_w2v_model does not exist, training......')
146 |         model = Word2Vec(sentences, vector_size=emb_size, window=window,
147 |                          min_count=min_count, sg=0, hs=1, seed=42)
148 |         with open(os.path.join(GENERATION_DIR, f'{f1}_{f2}_w2v_model.pkl'), 'wb') as f:
149 |             pickle.dump(model, f)
150 |     else:
151 |         print(f'{f1}_{f2}_w2v_model already exists, loading......')
152 |         with open(os.path.join(GENERATION_DIR, f'{f1}_{f2}_w2v_model.pkl'), 'rb') as f:
153 |             model = pickle.load(f)
154 |     # Mean-pool the word vectors of each token list; all-OOV lists get zeros.
155 |     emb_matrix_mean = []
156 |     for sent in sentences:
157 |         vec = []
158 |         for w in sent:
159 |             if w in model.wv:
160 |                 vec.append(model.wv[w])
161 |         if len(vec) > 0:
162 |             emb_matrix_mean.append(np.mean(vec, axis=0))
163 |         else:
164 |             emb_matrix_mean.append([0] * emb_size)
165 |     df_emb_mean = pd.DataFrame(emb_matrix_mean).add_prefix(f'{f1}_{f2}_w2v_')
166 | 
167 |     df_emb = pd.concat([df_emb, df_emb_mean], axis=1)
168 |     w2v_feats_df = w2v_feats_df.merge(df_emb, on=f1, how='left')
169 |     return w2v_feats_df
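# A small sketch (hypothetical tokens, not from the competition code) of the
# mean pooling add_w2v_feats performs: every token list is embedded token by
# token and averaged into one fixed-size vector, and a list whose tokens are
# all out-of-vocabulary gets a zero vector.
def _example_w2v_mean_pooling():
    sentences = [['cpu', 'caterr'], ['memory', 'ecc', 'memory']]
    model = Word2Vec(sentences, vector_size=8, window=5, min_count=1, sg=0, hs=1, seed=42)
    emb_matrix_mean = []
    for sent in sentences + [['unseen_token']]:
        vec = [model.wv[w] for w in sent if w in model.wv]
        emb_matrix_mean.append(np.mean(vec, axis=0) if vec else np.zeros(8))
    return pd.DataFrame(emb_matrix_mean).add_prefix('demo_w2v_')  # 3 rows x 8 dims
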
170 | def get_w2v_feats(all_data, f1_list, f2_list):
171 |     all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
172 |     all_data['msg_0'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 0)])
173 |     all_data['msg_1'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 1)])
174 |     all_data['msg_2'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 2)])
175 |     w2v_feats_df = all_data[f1_list].drop_duplicates()
176 |     for f1 in f1_list:
177 |         for f2 in f2_list:
178 |             w2v_feats_df = add_w2v_feats(all_data, w2v_feats_df, f1, f2, emb_size=10, window=5, min_count=5)
179 |     print(f'Feature dimensions of w2v_feats: {w2v_feats_df.shape}')
180 |     return w2v_feats_df
181 | 
182 | 
183 | 
184 | def get_time_diff_feats_v2(all_data):
185 |     print('Build time difference features time_diff_feats_v2')
186 |     all_data['duration_seconds'] = all_data['time_interval']
187 |     all_data['duration_minutes'] = all_data['time_interval'] / 60
188 |     df_merge_log = all_data[['sn', 'fault_time', 'label', 'time', 'msg',
189 |                              'server_model', 'time_interval', 'duration_seconds',
190 |                              'duration_minutes']]
191 |     df_merge_log['fault_id'] = df_merge_log['sn'] + '_' + df_merge_log['fault_time'] + '_' + df_merge_log[
192 |         'server_model']
193 |     f1_list = ['fault_id', 'sn', 'server_model']
194 |     f2_list = ['duration_minutes', 'duration_seconds']
195 |     time_diff_feats_v2 = df_merge_log[['sn', 'fault_time', 'fault_id', 'server_model']].drop_duplicates().reset_index(
196 |         drop=True)
197 | 
198 |     for f1 in f1_list:
199 |         for f2 in f2_list:
200 |             func_opt = ['count', 'nunique', 'min', 'max', 'median', 'sum']
201 |             for opt in func_opt:
202 |                 tmp = df_merge_log.groupby([f1])[f2].agg([(f'{f2}_in_{f1}_' + opt, opt)]).reset_index()
203 |                 # print(f'{f1}_in_{f2}_{opt}:{tmp.shape}' )
204 |                 time_diff_feats_v2 = time_diff_feats_v2.merge(tmp, on=f1, how='left')
205 |             # stats.mode below relies on the pre-1.11 SciPy return shape.
206 |             temp = df_merge_log.groupby([f1])[f2].apply(lambda x: stats.mode(x)[0][0])
207 |             time_diff_feats_v2[f'{f2}_in_{f1}_mode'] = time_diff_feats_v2[f1].map(temp).fillna(np.nan)
208 |             secs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
209 |             for sec in secs:
210 |                 temp = df_merge_log.groupby([f1])[f2].quantile(sec).reset_index(
211 |                     name=f'log_{f2}_in_{f1}_quantile_' + str(sec * 100))
212 |                 # print(f'log_{f1}_in_{f2}_quantile_{str(sec * 100)}:{tmp.shape}' )
213 |                 time_diff_feats_v2 = pd.merge(time_diff_feats_v2, temp, on=f1, how='left')
214 |     del time_diff_feats_v2['fault_id']
215 |     return time_diff_feats_v2
216 | 
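# A toy run (made-up values, not from the competition code) showing the
# column-naming scheme produced above: each (f1, f2) pair yields
# '<f2>_in_<f1>_<stat>' aggregates plus 'log_<f2>_in_<f1>_quantile_<q>'
# quantile columns, all merged back onto the per-fault key frame.
def _example_time_diff_naming():
    df_merge_log = pd.DataFrame({'sn': ['a', 'a', 'b'],
                                 'duration_seconds': [10.0, 30.0, 5.0]})
    out = df_merge_log.groupby(['sn'])['duration_seconds'].agg(
        [('duration_seconds_in_sn_min', 'min'),
         ('duration_seconds_in_sn_max', 'max')]).reset_index()
    temp = df_merge_log.groupby(['sn'])['duration_seconds'].quantile(0.5).reset_index(
        name='log_duration_seconds_in_sn_quantile_50.0')
    return out.merge(temp, on='sn', how='left')
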


def get_msg_location(x, num):
    try:
        return x[num]
    except IndexError:
        return '其它'


def get_nearest_msg_fea(train, test):
    print('Generating nearest_msg features')
    df = pd.concat([train, test], axis=0, ignore_index=True)
    df['duration_minutes'] = (pd.to_datetime(df['fault_time']) - pd.to_datetime(df['time'])).apply(
        lambda x: x.total_seconds())
    df = df.sort_values(
        ['sn', 'server_model', 'fault_time', 'time']).reset_index(drop=True)
    df['duration_minutes_abs'] = np.abs(df['duration_minutes'])

    df['duration_minutes_abs_rank'] = df.groupby(['sn', 'server_model', 'fault_time'])['duration_minutes_abs'].rank(
        method='first', ascending=False)

    key = ['sn', 'server_model', 'fault_time', 'duration_minutes_abs']
    df = df.sort_values(key, ascending=False)
    df = df.drop_duplicates(['sn', 'server_model', 'fault_time'], keep='first')

    # duration_minutes is fault_time - time in seconds, so a positive value
    # (equal to its abs) marks a log written before the fault (flag 1), a
    # negative value a log written after it (flag 0).
    df.loc[df['duration_minutes'] == df['duration_minutes_abs'], 'last_or_next'] = 1
    df.loc[df['duration_minutes'] != df['duration_minutes_abs'], 'last_or_next'] = 0
    df['msg_cnt'] = df['msg'].map(df['msg'].value_counts())
    df['msg_0'] = df['msg'].apply(lambda x: get_msg_location(x.split(' | '), 0))
    df['msg_0_cnt'] = df['msg_0'].map(df['msg_0'].value_counts())
    df['msg_1'] = df['msg'].apply(lambda x: get_msg_location(x.split(' | '), 1))
    df['msg_1_cnt'] = df['msg_1'].map(df['msg_1'].value_counts())
    df['msg_2'] = df['msg'].apply(lambda x: get_msg_location(x.split(' | '), 2))
    df['msg_2_cnt'] = df['msg_2'].map(df['msg_2'].value_counts())
    cat_feats = ['msg', 'msg_0', 'msg_1', 'msg_2']
    df = cat2num(df, cat_feats)
    df = df.drop_duplicates().reset_index(drop=True)
    df = df[['sn', 'server_model', 'fault_time', 'msg_cnt',
             'msg_0_cnt', 'msg_1_cnt', 'msg_2_cnt',
             'last_or_next', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc']]
    print(f'Finished generating nearest_msg features, dimensions: {df.shape}')
    return df


def get_server_model_time_interval_stat_fea(all_data):
    server_model_time_interval_stat_fea = all_data.groupby('server_model').agg(
        {'time_interval': ['min', 'max', 'mean', 'median']}).reset_index()
    server_model_time_interval_stat_fea = get_new_cols(
        server_model_time_interval_stat_fea, key=['server_model'])
    server_model_time_interval_stat_fea.columns = [
        'server_model', 'sm_time_interval_min', 'sm_time_interval_max',
        'sm_time_interval_mean', 'sm_time_interval_median']
    return server_model_time_interval_stat_fea


def get_server_model_sn_fea_2(train, test):
    df = pd.concat([train[['sn', 'server_model']],
                    test[['sn', 'server_model']]], ignore_index=True)
    df['server_model_count_sn_2'] = df.groupby(['server_model'])['sn'].transform('count')
    df['server_model_nunique_sn_2'] = df.groupby(['server_model'])['sn'].transform('nunique')
    df['sn_cnt_2'] = df['sn'].map(df['sn'].value_counts())
    return df.drop_duplicates().reset_index(drop=True)
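
# Quick illustration of the transforms above (toy values, hypothetical):
# 'count' repeats the group size on every row, 'nunique' the number of
# distinct sn in the group.
def _transform_example():
    toy = pd.DataFrame({'server_model': ['m1', 'm1', 'm2'],
                        'sn': ['a', 'a', 'b']})
    toy['cnt'] = toy.groupby('server_model')['sn'].transform('count')     # 2, 2, 1
    toy['nuni'] = toy.groupby('server_model')['sn'].transform('nunique')  # 1, 1, 1
    return toy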


def get_4_time_stat_fea(df):
    print(' Generating time statistic features')
    time_stat_fea_df = df.groupby(['sn', 'fault_time', 'server_model']).agg(
        {'duration_minutes': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std', 'count'],
         'log_duration_minutes': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std'],
         'time_diff_1': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std'],
         'log_time_diff_1': ['min', 'max', 'median'],
         }).reset_index()
    new_time_stat_cols = []
    for i in time_stat_fea_df.columns:
        if i[0] in ['sn', 'fault_time', 'server_model']:
            new_time_stat_cols.append(i[0])
        else:
            new_time_stat_cols.append(f'{i[0]}_{i[1]}')
            # np.log produces -inf/inf for zero or negative inputs; clip them
            # to sentinel values before the columns are flattened.
            time_stat_fea_df.loc[time_stat_fea_df[i[0]][i[1]] == -np.inf, (i[0], i[1])] = -20
            time_stat_fea_df.loc[time_stat_fea_df[i[0]][i[1]] == np.inf, (i[0], i[1])] = 30
    time_stat_fea_df.columns = new_time_stat_cols
    time_stat_fea_df['duration_minutes_range'] = time_stat_fea_df['duration_minutes_max'] - \
        time_stat_fea_df['duration_minutes_min']
    time_stat_fea_df['log_duration_minutes_range'] = time_stat_fea_df['log_duration_minutes_max'] - \
        time_stat_fea_df['log_duration_minutes_min']
    time_stat_fea_df['time_diff_1_range'] = time_stat_fea_df['time_diff_1_max'] - \
        time_stat_fea_df['time_diff_1_min']
    time_stat_fea_df['log_time_diff_1_range'] = time_stat_fea_df['log_time_diff_1_max'] - \
        time_stat_fea_df['log_time_diff_1_min']
    time_stat_fea_df['duration_minutes_freq'] = time_stat_fea_df['duration_minutes_range'] / \
        time_stat_fea_df['duration_minutes_count']
    print(f' Finished generating time statistic features, dimensions: {time_stat_fea_df.shape}')
    return time_stat_fea_df


def get_time_std_fea(train, test):
    print('Generating time std features')
    df = pd.concat([train, test], axis=0, ignore_index=True)
    # df['year'] = df['time'].dt.year
    # df['month'] = df['time'].dt.month
    df['hour'] = df['time'].dt.hour
    # df['week'] = df['time'].dt.week
    df['minute'] = df['time'].dt.minute
    time_std = df.groupby(['sn', 'server_model']).agg(
        {'hour': 'std', 'minute': 'std'}).reset_index()
    time_std = time_std.rename(columns={'hour': 'hour_std', 'minute': 'minute_std'})
    return time_std
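
# Why the ±inf clipping above matters (toy check, values hypothetical):
# time_diff_1 is filled with 0 for the first log of every group, so its log
# is -inf and would otherwise leak into min/median aggregates.
def _log_inf_example():
    x = np.log(np.array([0.0, 60.0]))      # -> [-inf, ~4.09]
    return np.where(x == -np.inf, -20, x)  # -> [-20.0, ~4.09]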


def get_key(all_data):
    all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
    class_fea_cnt_list = []
    for label in [0, 1, 2, 3]:
        class_df = all_data.query(f'label == {label}')
        counter = Counter()
        for i in class_df['msg_list']:
            counter.update(i)
        class_fea_cnt = pd.DataFrame({i[0]: i[1] for i in counter.most_common()},
                                     index=[f'fea_cnt_{label}']).T.reset_index().rename(columns={'index': 'fea'})
        class_fea_cnt_list.append(class_fea_cnt)

    fea_cnt_df = class_fea_cnt_list[0]
    for tmp in class_fea_cnt_list[1:]:
        fea_cnt_df = fea_cnt_df.merge(tmp, on='fea')

    fea_cnt_df['fea_cnt_sum'] = fea_cnt_df.loc[:, ['fea_cnt_0', 'fea_cnt_1',
                                                   'fea_cnt_2', 'fea_cnt_3']].sum(1)

    all_fea_cnt = fea_cnt_df['fea_cnt_sum'].sum()

    for i in ['fea_cnt_0', 'fea_cnt_1', 'fea_cnt_2', 'fea_cnt_3']:
        fea_cnt_df[f'{i}_ratio'] = fea_cnt_df[i] / fea_cnt_df['fea_cnt_sum']
        fea_cnt_df[f'{i}_all_ratio'] = fea_cnt_df[i] / all_fea_cnt

    fea_cnt_df['fea_cnt_ratio_std'] = fea_cnt_df.loc[:, ['fea_cnt_0_ratio', 'fea_cnt_1_ratio',
                                                         'fea_cnt_2_ratio', 'fea_cnt_3_ratio']].std(1)
    fea_cnt_df['fea_cnt_std'] = fea_cnt_df.loc[:, ['fea_cnt_0', 'fea_cnt_1',
                                                   'fea_cnt_2', 'fea_cnt_3']].std(1)

    fea_cnt_df['fea_cnt_all_ratio_std'] = fea_cnt_df.loc[:, ['fea_cnt_0_all_ratio', 'fea_cnt_1_all_ratio',
                                                             'fea_cnt_2_all_ratio', 'fea_cnt_3_all_ratio']].std(1)

    fea_cnt_df = fea_cnt_df[~fea_cnt_df['fea_cnt_ratio_std'].isnull()].sort_values(
        'fea_cnt_ratio_std', ascending=False)

    # Assign each log template to the class in which it occurs most often.
    fea_cnt_df['fea_max'] = np.argmax(fea_cnt_df.loc[:, ['fea_cnt_0', 'fea_cnt_1',
                                                         'fea_cnt_2', 'fea_cnt_3']].values, axis=1)
    key_0 = fea_cnt_df.query('fea_max == 0')['fea'].to_list()
    key_1 = fea_cnt_df.query('fea_max == 1')['fea'].to_list()
    key_2 = fea_cnt_df.query('fea_max == 2')['fea'].to_list()
    key_3 = fea_cnt_df.query('fea_max == 3')['fea'].to_list()
    # Hard-coded keyword lists kept for reference:
    # key_1 = ['OEM record c2','Processor CPU_Core_Error','001c4c','System Event Sys_Event','Power Supply PS0_Status','Temperature CPU0_Margin_Temp','Reading 51 > Threshold 85 degrees C','Lower Non-critical going low','Temperature CPU1_Margin_Temp','System ACPI Power State #0x7d','Lower Critical going low']
    # key_2 = ['OEM CPU0 MCERR','OEM CPU0 CATERR','Reading 0 < Threshold 2 degrees C','0203c0a80101','Unknown CPU0 MCERR','Unknown CPU0 CATERR','Microcontroller #0x3b','System Boot Initiated','Processor #0xfa','Power Unit Pwr Unit Status','Hard reset','Power off/down','System Event #0xff','Memory CPU1A1_DIMM_Stat','000000','Power cycle','OEM record c3','Memory CPU1C0_DIMM_Stat','Reading 0 < Threshold 1 degrees C','IERR']
    # key_3 = ['Memory','Correctable ECC logging limit reached','Memory MEM_CHE0_Status','Memory Memory_Status','Memory #0x87','Memory CPU0F0_DIMM_Stat','Memory Device Disabled','Memory #0xe2','OS Stop/Shutdown OS Status','System Boot Initiated System Restart','OS Boot BIOS_Boot_Up','System Boot Initiated BIOS_Boot_UP','Memory DIMM101','OS graceful shutdown','OS Critical Stop OS Status','Memory #0xf9','Memory CPU0C0_DIMM_Stat','Memory DIMM111','Memory DIMM021',]
    # key_4 = ['Drive Fault','NMI/Diag Interrupt','Failure detected','Power Supply AC lost','Power Supply PSU0_Supply','AC out-of-range, but present','Predictive failure','Drive Present','Temperature Temp_DIMM_KLM','Temperature Temp_DIMM_DEF','Power Supply PS1_Status','Identify Status','Power Supply PS2_Status','Temperature DIMMG1_Temp','Upper Non-critical going high','Temperature DIMMG0_Temp','Upper Critical going high','Power Button pressed','System Boot Initiated #0xb8','Deasserted']
    return key_0, key_1, key_2, key_3
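
# Toy illustration of the selection rule in get_key (numbers hypothetical):
# a template whose counts are concentrated in one class gets a high ratio-std
# and is assigned to that class via argmax; evenly spread templates get a low
# std and end up at the bottom of the sort.
def _get_key_rule_example():
    counts = np.array([[90, 5, 3, 2],     # template A -> class 0
                       [10, 12, 9, 11]])  # template B -> spread out, low std
    ratios = counts / counts.sum(axis=1, keepdims=True)
    return ratios.std(axis=1), np.argmax(counts, axis=1)  # stds, classes [0, 1]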


def get_class_key_words_nunique(all_data):
    print('Generating class_key_words_nunique features')

    key_0, key_1, key_2, key_3 = get_key(all_data)

    df = all_data[['sn', 'fault_time', 'msg_list']]
    df_tmp = df.groupby(['sn']).agg({'msg_list': 'sum'}).reset_index()
    df_tmp['class_0_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x: len(set(x) & set(key_0)))
    df_tmp['class_1_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x: len(set(x) & set(key_1)))
    df_tmp['class_2_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x: len(set(x) & set(key_2)))
    df_tmp['class_3_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x: len(set(x) & set(key_3)))
    del df_tmp['msg_list']
    return df_tmp


def get_key_for_top_fea(train, test):
    KEY_FOR_TOP_COLS = []
    print('Adding key_for_top_fea features')
    for TIME in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600, 60000000]:
        for i in range(10):
            train[f'KEY_FOR_TOP_{i}_{TIME}'] = train[f'{KEY_1[i]}_{TIME}'].astype(str) + '_' + \
                train[f'{KEY_2[i]}_{TIME}'].astype(str) + '_' + \
                train[f'{KEY_3[i]}_{TIME}'].astype(str) + '_' + \
                train[f'{KEY_4[i]}_{TIME}'].astype(str)
            test[f'KEY_FOR_TOP_{i}_{TIME}'] = test[f'{KEY_1[i]}_{TIME}'].astype(str) + '_' + \
                test[f'{KEY_2[i]}_{TIME}'].astype(str) + '_' + \
                test[f'{KEY_3[i]}_{TIME}'].astype(str) + '_' + \
                test[f'{KEY_4[i]}_{TIME}'].astype(str)
            KEY_FOR_TOP_COLS.append(f'KEY_FOR_TOP_{i}_{TIME}')
    train = cat2num(train, KEY_FOR_TOP_COLS)
    test = cat2num(test, KEY_FOR_TOP_COLS)
    for KEY_FOR_TOP_COL in KEY_FOR_TOP_COLS:
        del train[KEY_FOR_TOP_COL]
        del test[KEY_FOR_TOP_COL]
    return train, test


def get_key_word_cross_fea(train, test):
    print('Generating keyword cross features...')
    KEY_WORDS_MAP = {'CPU0': KEY_1, 'CPU1': KEY_2, 'CPU2': KEY_3, 'CPU3': KEY_4}
    KEY_WORDS_CROSS_COLS = []
    for KEY_WORDS in KEY_WORDS_MAP:
        for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600, 60000000]:
            KEY_WORDS_COLS = [f'{col}_{i}' for col in KEY_WORDS_MAP[KEY_WORDS]]
            train[f'{KEY_WORDS}_WORDS_{i}'] = train[KEY_WORDS_COLS].astype(str).sum(1)
            test[f'{KEY_WORDS}_WORDS_{i}'] = test[KEY_WORDS_COLS].astype(str).sum(1)
            KEY_WORDS_CROSS_COLS.append(f'{KEY_WORDS}_WORDS_{i}')
    train = cat2num(train, KEY_WORDS_CROSS_COLS)
    test = cat2num(test, KEY_WORDS_CROSS_COLS)

    for COLS in KEY_WORDS_CROSS_COLS:
        del train[COLS]
        del test[COLS]
    print('Finished generating keyword cross features...')
    return train, test


def get_time_quantile_fea(df):
    print(' Generating time quantile features')
    secs = [0.2, 0.4, 0.6, 0.8]
    time_fea_list = []
    for sec in tqdm(secs):
        for time_fea_type in [
                'duration_minutes', 'log_duration_minutes', 'time_diff_1', 'log_time_diff_1']:
            temp = df.groupby(['sn', 'server_model', 'fault_time'])[time_fea_type].quantile(sec).reset_index(
                name=f'{time_fea_type}_' + str(sec * 100))
            time_fea_list.append(temp)
    time_fea_df = time_fea_list[0]
    for time_fea in time_fea_list[1:]:
        time_fea_df = time_fea_df.merge(
            time_fea, how='left', on=['sn', 'server_model', 'fault_time'])
    print(f' Finished generating time quantile features, dimensions: {time_fea_df.shape}')
    return time_fea_df
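
# The cross features above are plain string concatenations of per-keyword
# window counts, later label-encoded by cat2num. A toy row (column names and
# counts are hypothetical):
def _cross_fea_example():
    row = pd.Series({'kw_a_30': 2, 'kw_b_30': 0, 'kw_c_30': 1})
    return '_'.join(row.astype(str))  # -> '2_0_1'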


def get_server_model_fea(train, test):
    print('Generating server_model features')
    df = pd.concat([train, test], axis=0, ignore_index=True)
    df['server_model_count_sn'] = df.groupby(['server_model'])['sn'].transform('count')
    df['server_model_nunique_sn'] = df.groupby(['server_model'])['sn'].transform('nunique')

    df['sn_cnt'] = df['sn'].map(df['sn'].value_counts())
    df['sn_freq'] = df['sn'].map(df['sn'].value_counts() / len(df))
    df['server_model_cnt'] = df['server_model'].map(df['server_model'].value_counts())
    df['server_model_freq'] = df['server_model'].map(df['server_model'].value_counts() / len(df))
    select_cols = ['sn', 'server_model',
                   'server_model_count_sn', 'server_model_nunique_sn',
                   'sn_cnt', 'sn_freq', 'server_model_cnt', 'server_model_freq']
    server_model_fea = df[select_cols]

    cat_feats = ['server_model']
    server_model_fea = cat2num(server_model_fea, cat_feats, Transfer2num=True)
    server_model_fea = server_model_fea.drop_duplicates().reset_index(drop=True)
    print(f'Finished generating server_model features, dimensions: {server_model_fea.shape}')
    return server_model_fea


def get_time_type_msg_unique_fea(df):
    df['msg_list'] = df['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
    df['msg_0'] = df['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 0)])
    df['msg_1'] = df['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 1)])
    df['msg_2'] = df['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 2)])

    df = df.groupby(['sn', 'fault_time']).agg(
        {'msg_list': 'sum', 'msg_0': 'sum', 'msg_1': 'sum', 'msg_2': 'sum'}).reset_index()

    df['msg_set'] = df['msg_list'].apply(lambda x: '|'.join(set(x)))
    df['msg_0_set'] = df['msg_0'].apply(lambda x: '|'.join(set(x)))
    df['msg_1_set'] = df['msg_1'].apply(lambda x: '|'.join(set(x)))
    df['msg_2_set'] = df['msg_2'].apply(lambda x: '|'.join(set(x)))
    df = df[['sn', 'fault_time', 'msg_set', 'msg_0_set', 'msg_1_set', 'msg_2_set']]
    return df


def get_msg_unique_fea(train, test, time_type='last'):
    print('Generating msg_unique features')
    common_cols = ['msg_set', 'msg_0_set', 'msg_1_set', 'msg_2_set']
    df = pd.concat([train, test], axis=0, ignore_index=True)
    df['time_interval'] = (pd.to_datetime(df['fault_time']) - df['time']).apply(
        lambda x: x.total_seconds())

    # Logs before the fault ('last'), after it ('next'), and both ('all');
    # the outer merges keep faults that only have logs on one side.
    last_fea = get_time_type_msg_unique_fea(df.query('time_interval > 0'))
    last_fea = last_fea.rename(columns={i: f'last_{i}' for i in common_cols})
    next_fea = get_time_type_msg_unique_fea(df.query('time_interval < 0'))
    next_fea = next_fea.rename(columns={i: f'next_{i}' for i in common_cols})
    all_fea = get_time_type_msg_unique_fea(df)
    all_fea = all_fea.rename(columns={i: f'all_{i}' for i in common_cols})
    msg_unique_fea = all_fea.merge(last_fea, on=['sn', 'fault_time'], how='outer')
    msg_unique_fea = msg_unique_fea.merge(next_fea, on=['sn', 'fault_time'], how='outer')
    return msg_unique_fea
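
# Toy check of the set features above (message values hypothetical):
# duplicates in a fault's log window collapse to one token in msg_set, so the
# feature captures *which* templates occurred, not how often.
def _msg_set_example():
    msg_list = ['IERR', 'Memory', 'IERR']
    return '|'.join(set(msg_list))  # e.g. 'IERR|Memory' (order not guaranteed)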


def get_duration_minutes_fea(train, test):
    print('Generating duration_minutes features')
    df = pd.concat([train, test], axis=0, ignore_index=True)
    df['duration_minutes'] = (pd.to_datetime(df['fault_time']) - pd.to_datetime(df['time'])).apply(
        lambda x: x.total_seconds())
    df['log_duration_minutes'] = np.log(df['duration_minutes'])

    df = df.sort_values(['sn', 'label', 'server_model',
                         'fault_time', 'time']).reset_index(drop=True)
    df['time_diff_1'] = (df.groupby(['sn', 'server_model', 'fault_time'])['time'].diff(1)).apply(
        lambda x: x.total_seconds())
    df['time_diff_1'] = df['time_diff_1'].fillna(0)
    df['log_time_diff_1'] = np.log(df['time_diff_1'])

    # time_quantile_fea_df = get_time_quantile_fea(df)
    # df_tmp = time_quantile_fea_df.merge(time_stat_fea_df, on=['sn', 'server_model', 'fault_time'], how='left')
    time_stat_fea_df = get_4_time_stat_fea(df)
    df_tmp = time_stat_fea_df
    print(f'Finished generating duration_minutes features, dimensions: {df_tmp.shape}')
    return df_tmp


def get_msg_text_fea_all(all_data):
    all_data['label'] = all_data['label'].fillna(-1)
    all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
    all_data['msg_0'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 0)])
    all_data['msg_1'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 1)])
    all_data['msg_2'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 2)])

    all_data = all_data.sort_values(['sn', 'fault_time', 'time']).reset_index(drop=True)
    del all_data['label']
    last_data = all_data.query('time_interval > 0')
    next_data = all_data.query('time_interval <= 0')

    # Only the pre-fault ('last') text features are used; the 'all' and
    # 'next' variants remain disabled.
    # all_msg_text_fea = get_msg_text_fea(all_data, msg_type='all')
    last_msg_text_fea = get_msg_text_fea(last_data, msg_type='last')
    # next_msg_text_fea = get_msg_text_fea(next_data, msg_type='next')
    msg_text_fea = last_msg_text_fea
    return msg_text_fea


def get_test_key_words(train, test):
    df = pd.concat([train[['sn', 'fault_time', 'label', 'msg']],
                    test[['sn', 'fault_time', 'msg']]],
                   ignore_index=True).drop_duplicates(['sn', 'fault_time', 'msg'])
    # Test rows have no label; park them in a pseudo class 5.
    df['label'] = df['label'].fillna(5)
    df['msg_list'] = df['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
    words_cnt_df_list = []
    for label in df['label'].unique():
        label = int(label)
        df_tmp = df.query(f'label == {label}')
        counter = Counter()
        for words in df_tmp['msg_list']:
            words = [i.replace('_', ' ') for i in words]
            counter.update(words)
        words_cnt_df = pd.DataFrame(counter, index=[0]).T.reset_index().rename(
            columns={'index': 'word', 0: f'cnt_{label}'})
        words_cnt_df_list.append(words_cnt_df)
    words_cnt_df = words_cnt_df_list[0]
    for i in words_cnt_df_list[1:]:
        words_cnt_df = words_cnt_df.merge(i, on='word', how='outer')

    words_cnt_df = words_cnt_df.fillna(-1)
    words_cnt_df1 = words_cnt_df.query('cnt_0 > 10 and cnt_2 > 10 and cnt_1 > 10 and cnt_3 > 10 and cnt_5 > 10')
    cnt_class = ['cnt_0', 'cnt_1', 'cnt_2', 'cnt_3', 'cnt_5']
    words_cnt_df1['word_cnt_sum'] = words_cnt_df1.loc[:, cnt_class].sum(1)
    for i in cnt_class:
        words_cnt_df1[f'{i}_ratio'] = words_cnt_df1[i] / words_cnt_df1['word_cnt_sum']
    words_cnt_df1['word_cnt_ratio_std'] = words_cnt_df1.loc[:, ['cnt_0_ratio', 'cnt_1_ratio',
                                                                'cnt_2_ratio', 'cnt_3_ratio']].std(1)
    words_cnt_df1['cnt_1_0_diff'] = words_cnt_df1['cnt_1_ratio'] - words_cnt_df1['cnt_0_ratio']
    test_key_words = words_cnt_df1.sort_values('cnt_5', ascending=False)['word'].to_list()[5:40]
    return test_key_words
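
# Toy view of the filter in get_test_key_words (counts hypothetical): a word
# must clear count > 10 in every class, including the pseudo test class 5,
# before its per-class ratios are compared.
def _test_key_words_filter_example():
    row = {'cnt_0': 12, 'cnt_1': 30, 'cnt_2': 11, 'cnt_3': 15, 'cnt_5': 40}
    total = sum(row.values())
    return {k: v / total for k, v in row.items()}  # per-class ratios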


def get_w2v_mean(w2v_model, sentences):
    emb_matrix = list()
    vec = list()
    for w in sentences.split():
        if w in w2v_model.wv:
            vec.append(w2v_model.wv[w])
    if len(vec) > 0:
        emb_matrix.append(np.mean(vec, axis=0))
    else:
        emb_matrix.append([0] * w2v_model.vector_size)
    return emb_matrix


def get_tfidf_svd(tfv, svd, sentences):
    X_tfidf = tfv.transform(sentences)
    X_svd = svd.transform(X_tfidf)
    return np.mean(X_svd, axis=0)


def get_w2v_tfidf_fea(all_data):
    print('Word2Vec encoding')
    df = all_data
    df['msg_list'] = df['msg'].apply(lambda x: [i.strip().lower().replace(' ', '_') for i in x.split(' | ')])
    df = df.groupby(['sn']).agg({'msg_list': 'sum'}).reset_index()
    df['text'] = df['msg_list'].apply(lambda x: ' '.join(x))

    sentences_list = df['text'].values.tolist()
    sentences = [s.split() for s in sentences_list]
    w2v_model = Word2Vec(sentences, vector_size=10, window=3, min_count=5, sg=0, hs=1, seed=2022)
    df['text_w2v'] = df['text'].apply(lambda x: get_w2v_mean(w2v_model, x)[0])

    print('TF-IDF encoding')
    X = df['text'].to_list()
    tfv = TfidfVectorizer(ngram_range=(1, 3), min_df=5, max_features=50000)
    tfv.fit(X)
    X_tfidf = tfv.transform(X)
    svd = TruncatedSVD(n_components=16)  # dimensionality reduction
    svd.fit(X_tfidf)
    df['text_tfidf'] = df['text'].apply(lambda x: get_tfidf_svd(tfv, svd, x.split()))

    print('Doc2Vec encoding')
    texts = df['text'].tolist()
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]
    model = Doc2Vec(documents, window=5, min_count=3, workers=4)
    docvecs = model.docvecs
    df['doc2vec'] = [docvecs[i] for i in range(len(docvecs))]

    # The Word2Vec vectors are 10-dimensional (vector_size=10 above, so the
    # original range(32) would raise IndexError), the SVD vectors are
    # 16-dimensional, and Doc2Vec defaults to 100 dimensions.
    for i in range(10):
        df[f'msg_w2v_{i}'] = df['text_w2v'].apply(lambda x: x[i])
    for i in range(16):
        df[f'msg_tfv_{i}'] = df['text_tfidf'].apply(lambda x: x[i])
    for i in range(100):
        df[f'msg_doc2vec_{i}'] = df['doc2vec'].apply(lambda x: x[i])

    save_cols = [i for i in df.columns if i not in ['msg_list', 'text', 'text_w2v', 'text_tfidf', 'doc2vec']]
    return df[save_cols]


# w2v_tfidf_fea = get_w2v_tfidf_fea(all_data)
class BetaEncoder(object):

    def __init__(self, group):
        self.group = group
        self.stats = None
        self.prior_mean = None

    # get counts from df
    def fit(self, df, target_col):
        # prior mean of the target
        self.prior_mean = np.mean(df[target_col])
        stats = df[[target_col, self.group]].groupby(self.group)
        # per-group sum and count
        stats = stats.agg(['sum', 'count'])[target_col]
        stats.rename(columns={'sum': 'n', 'count': 'N'}, inplace=True)
        stats.reset_index(level=0, inplace=True)
        self.stats = stats

    # extract posterior statistics
    def transform(self, df, stat_type, N_min=1):
        df_stats = pd.merge(df[[self.group]], self.stats, how='left')
        n = df_stats['n'].copy()
        N = df_stats['N'].copy()

        # fill in missing groups with the prior
        nan_indexs = np.isnan(n)
        n[nan_indexs] = self.prior_mean
        N[nan_indexs] = 1.0

        # prior parameters
        N_prior = np.maximum(N_min - N, 0)
        alpha_prior = self.prior_mean * N_prior
        beta_prior = (1 - self.prior_mean) * N_prior

        # posterior parameters
        alpha = alpha_prior + n
        beta = beta_prior + N - n

        # calculate statistics of the Beta(alpha, beta) posterior
        if stat_type == 'mean':
            num = alpha
            dem = alpha + beta

        elif stat_type == 'mode':
            num = alpha - 1
            dem = alpha + beta - 2

        elif stat_type == 'median':
            num = alpha - 1 / 3
            dem = alpha + beta - 2 / 3

        elif stat_type == 'var':
            num = alpha * beta
            dem = (alpha + beta) ** 2 * (alpha + beta + 1)

        elif stat_type == 'skewness':
            num = 2 * (beta - alpha) * np.sqrt(alpha + beta + 1)
            dem = (alpha + beta + 2) * np.sqrt(alpha * beta)

        elif stat_type == 'kurtosis':
            num = 6 * (alpha - beta) ** 2 * (alpha + beta + 1) - \
                alpha * beta * (alpha + beta + 2)
            dem = alpha * beta * (alpha + beta + 2) * (alpha + beta + 3)

        # replace missing
        value = num / dem
        value[np.isnan(value)] = np.nanmedian(value)
        return value
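
# Minimal usage sketch for BetaEncoder (toy frame, hypothetical values; the
# real call sites are in get_beta_target below, against the 'label' column):
def _beta_encoder_usage_example():
    toy = pd.DataFrame({'server_model': ['m1', 'm1', 'm2', 'm2', 'm2'],
                        'label': [1, 0, 1, 1, 0]})
    be = BetaEncoder('server_model')
    be.fit(toy, 'label')
    # With N_min=10 the prior dominates these small groups and shrinks the
    # per-group means (0.5 and ~0.67) toward the global mean of 0.6.
    return be.transform(toy, 'mean', N_min=10)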


def get_beta_target(train, test):
    N_min = 1000
    feature_cols = []

    # encode variables
    for c in ['server_model']:
        # fit encoder
        be = BetaEncoder(c)
        be.fit(train, 'label')

        # mean
        feature_name = f'{c}_mean'
        train[feature_name] = be.transform(train, 'mean', N_min)
        test[feature_name] = be.transform(test, 'mean', N_min)
        feature_cols.append(feature_name)

        # mode
        feature_name = f'{c}_mode'
        train[feature_name] = be.transform(train, 'mode', N_min)
        test[feature_name] = be.transform(test, 'mode', N_min)
        feature_cols.append(feature_name)

        # median
        feature_name = f'{c}_median'
        train[feature_name] = be.transform(train, 'median', N_min)
        test[feature_name] = be.transform(test, 'median', N_min)
        feature_cols.append(feature_name)

        # var
        feature_name = f'{c}_var'
        train[feature_name] = be.transform(train, 'var', N_min)
        test[feature_name] = be.transform(test, 'var', N_min)
        feature_cols.append(feature_name)

        # skewness (disabled)
        # feature_name = f'{c}_skewness'
        # train[feature_name] = be.transform(train, 'skewness', N_min)
        # test[feature_name] = be.transform(test, 'skewness', N_min)
        # feature_cols.append(feature_name)

        # kurtosis
        feature_name = f'{c}_kurtosis'
        train[feature_name] = be.transform(train, 'kurtosis', N_min)
        test[feature_name] = be.transform(test, 'kurtosis', N_min)
        feature_cols.append(feature_name)
    df = pd.concat([train, test]).reset_index(drop=True)
    df = df[['sn', 'fault_time', 'server_model', 'server_model_mean',
             'server_model_mode', 'server_model_median', 'server_model_var',
             'server_model_kurtosis']].drop_duplicates().reset_index(drop=True)
    return df
--------------------------------------------------------------------------------