├── .gitignore
├── .DS_Store
├── .idea
│   └── workspace.xml
├── 3rd_PanJiu_AIOps_Competition
│   ├── .DS_Store
│   ├── model
│   │   └── model.pkl
│   ├── code
│   │   ├── requirements.txt
│   │   ├── .DS_Store
│   │   ├── log.py
│   │   ├── stacking.py
│   │   ├── generate_pseudo_label.py
│   │   ├── model.py
│   │   ├── utils.py
│   │   ├── lgb_fs.py
│   │   ├── catboost_fs.py
│   │   ├── get_crashdump_venus_fea.py
│   │   └── generate_feature.py
│   ├── data
│   │   ├── 数据集下载地址
│   │   └── .DS_Store
│   ├── tcdata
│   │   └── 数据集下载地址
│   ├── feature
│   │   └── .DS_Store
│   ├── 答辩PPT
│   │   ├── .DS_Store
│   │   └── 悦智AI实验室_20220525.pdf
│   ├── user_data
│   │   └── .DS_Store
│   ├── docker_push.sh
│   ├── run.sh
│   ├── run.log
│   ├── Dockerfile
│   ├── README.md
│   └── log
│       └── catboost.log
├── README.md
└── LICENSE
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea
3 |
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/model/model.pkl:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit_learn==1.0.2
2 |
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/.DS_Store
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/data/数据集下载地址:
--------------------------------------------------------------------------------
1 | https://tianchi.aliyun.com/competition/entrance/531947/information?lang=zh-cn
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/tcdata/数据集下载地址:
--------------------------------------------------------------------------------
1 | https://tianchi.aliyun.com/competition/entrance/531947/information?lang=zh-cn
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/.DS_Store
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/code/.DS_Store
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/data/.DS_Store
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/feature/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/feature/.DS_Store
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/答辩PPT/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/答辩PPT/.DS_Store
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/user_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/user_data/.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AI-Competition
2 | Open-source code from our past prize-winning competitions
3 |
4 | - [Runner-up solution for the 3rd Alibaba Cloud PanJiu AIOps Competition](./3rd_PanJiu_AIOps_Competition/README.md)
5 |   - INFO: CatBoost, pseudo labels, adversarial validation, millisecond-level prediction (best across all the competitions)
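6 |
7 | A minimal sketch of the adversarial-validation idea listed above (a hypothetical helper, not taken from the competition code; `feats` is an assumed feature list):
8 |
9 | ```
10 | # Adversarial validation: train a classifier to separate train rows from test rows.
11 | # An AUC well above 0.5 means the two sets are distributed differently, and the
12 | # classifier's top features are the ones that leak the split.
13 | import lightgbm as lgb
14 | import numpy as np
15 | import pandas as pd
16 | from sklearn.model_selection import cross_val_score
17 |
18 | def adversarial_auc(train_df: pd.DataFrame, test_df: pd.DataFrame, feats: list) -> float:
19 |     X = pd.concat([train_df[feats], test_df[feats]], ignore_index=True)
20 |     y = np.r_[np.zeros(len(train_df)), np.ones(len(test_df))]  # 0 = train, 1 = test
21 |     clf = lgb.LGBMClassifier(n_estimators=200, learning_rate=0.05)
22 |     return cross_val_score(clf, X, y, cv=5, scoring='roc_auc').mean()
23 | ```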
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/答辩PPT/悦智AI实验室_20220525.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/答辩PPT/悦智AI实验室_20220525.pdf
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/docker_push.sh:
--------------------------------------------------------------------------------
1 | # Build the image and push it to your image registry
2 | rm -rf result.zip
3 | # build the image
4 | docker build -t [your registry address]:[TAG] .
5 | # push the image
6 | docker push [your registry address]:[TAG]
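7 | # Usage (a sketch; fill in your registry address and TAG above first):
8 | #   bash docker_push.sh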
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/run.sh:
--------------------------------------------------------------------------------
1 | rm -rf model
2 | #unzip model.zip
3 | python3 code/get_crashdump_venus_fea.py
4 | python3 code/catboost_fs.py
5 | zip -j result.zip prediction_result/catboost_result.csv
6 |
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/run.log:
--------------------------------------------------------------------------------
1 | Archive: model.zip
2 | creating: model/deberta-base/
3 | inflating: model/debert_model_v21_128_fs_flod_5.h5
4 | inflating: model/debert_model_v21_128_fs_flod_6.h5
5 | inflating: model/debert_model_v21_128_fs_flod_8.h5
6 | inflating: model/README.txt
7 | inflating: model/weight_cs6399_fold_8_v21_128_fs.npy
8 | inflating: model/weight_cs6558_fold_5_v21_128_fs.npy
9 | inflating: model/weight_cs6614_fold_6_v21_128_fs.npy
10 | inflating: model/weight_fs6138_fold_8_v21_128_fs.npy
11 | inflating: model/weight_fs6280_fold_5_v21_128_fs.npy
12 | inflating: model/weight_fs6359_fold_6_v21_128_fs.npy
13 |
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/Dockerfile:
--------------------------------------------------------------------------------
1 | # Base Images
2 | ## Build from the Tianchi base image
3 | FROM registry.cn-shanghai.aliyuncs.com/tcc-public/python:3
4 | ## Copy the files in the current folder into the image root
5 | ADD . /
6 | ## Set the default working directory to the root (run.sh and the generated result files must live in this folder for the submission to run)
7 | WORKDIR /
8 |
9 | ## Install the required packages
10 | RUN pip config set global.index-url http://mirrors.aliyun.com/pypi/simple/
11 | RUN pip config set install.trusted-host mirrors.aliyun.com
12 | RUN pip3 install -r code/requirements.txt
13 | RUN pip install --upgrade pip
14 | RUN apt -y update
15 | RUN apt install -y zip
16 | RUN apt install -y vim
17 | RUN apt install -y screen
18 | RUN pip install catboost
19 | RUN pip install scikit-learn
20 | RUN pip install tqdm
21 | RUN pip install lightgbm
22 | RUN pip install gensim==4.1.2
23 |
24 | ## The container runs "sh run.sh" on startup
25 | CMD ["sh", "run.sh"]
26 |
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/log.py:
--------------------------------------------------------------------------------
1 |
2 | import logging
3 | import os
4 |
5 |
6 | class Logger:
7 | def __init__(self, name, log_path, mode='a'):
8 | """
9 | 程序运行日志类的构造函数
10 | :param name: 需要保存的日志文件名称,默认后缀名称为 .log
11 | :param log_path: 需要保存的日志文件路径
12 | :param mode: 日志写入模式, a:追加, w:覆盖
13 | 使用说明:
14 | 1、创建日志实例对象
15 | logger = Logger("textCNN_train", log_path="../logs").get_log
16 | 2、将关键信息通过日志实例对象写入日志文件
17 | logger.info("")
18 | """
19 | self.__name = name
20 | self.logger = logging.getLogger(self.__name)
21 | self.logger.setLevel(logging.DEBUG)
22 | self.log_path = log_path
23 | self.mode = mode
24 |
25 |         # Create a handler that writes to the log file
26 |         # log_path = os.path.dirname(os.path.abspath(__file__))
27 |         # Use utf-8 encoding to avoid garbled text in the log output
28 |         logname = os.path.join(self.log_path, self.__name + '.log')  # output log file name
29 |         # Define the handlers' output format
30 | formatter = logging.Formatter(
31 | '%(asctime)s-%(filename)s-[日志信息]-[%(module)s-%(funcName)s-line:%(lineno)d]-%(levelname)s: %(message)s')
32 |
33 |         fh = logging.FileHandler(logname, mode=self.mode, encoding='utf-8')  # single log file; mode 'a' appends, 'w' overwrites
34 | fh.setLevel(logging.DEBUG)
35 |
36 |         # Create a handler that writes to the console
37 | ch = logging.StreamHandler()
38 | ch.setLevel(logging.DEBUG)
39 |
40 | fh.setFormatter(formatter)
41 | ch.setFormatter(formatter)
42 |
43 |         # Attach the handlers to the logger
44 | self.logger.addHandler(fh)
45 | self.logger.addHandler(ch)
46 |
47 | @property
48 | def get_log(self):
49 | """定义一个函数,回调logger实例"""
50 | return self.logger
51 |
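52 | if __name__ == '__main__':
53 |     # Minimal usage sketch, mirroring the docstring above (assumes a ../log
54 |     # directory exists, e.g. created by utils.create_all_dir; the name is hypothetical):
55 |     logger = Logger('log_demo', log_path='../log', mode='w').get_log
56 |     logger.info('logger initialized')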
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/stacking.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import pandas as pd
4 | from utils import RESULT_DIR
5 | lgb_result = pd.read_csv(os.path.join(RESULT_DIR,'lgb_prob_result.csv'))
6 | lgb_result = lgb_result[lgb_result['label'].isnull()]
7 | print(lgb_result.columns)
8 | del lgb_result['label']
9 |
10 | cat_result = pd.read_csv(os.path.join(RESULT_DIR,'cat_prob_result.csv'))
11 | cat_result = cat_result[cat_result['label'].isnull()]
12 | del cat_result['label']
13 |
14 | # bert_result = pd.read_csv(os.path.join(RESULT_DIR,'bert_prob_result.csv'))
15 |
16 | model_weight = {'lgb':0.2,'cat':0.8,'bert':0.2}
17 | print(f'MODEL WEIGHT: {model_weight}')
18 | # for i in ['bert_class_0', 'bert_class_1', 'bert_class_2','bert_class_3']:
19 | # bert_result[i] = bert_result[i]*model_weight['bert']
20 |
21 | for i in ['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3']:
22 | cat_result[i] = cat_result[i]*model_weight['cat']
23 |
24 | for i in ['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3']:
25 | lgb_result[i] = lgb_result[i]*model_weight['lgb']
26 |
27 | result= lgb_result.merge(cat_result,on =['sn', 'fault_time'],how ='left' )
28 |
29 | # result= bert_result.merge(cat_result,on =['sn', 'fault_time'],how ='left' )
30 | #
31 | # result['class_0'] =result.loc[:,['cat_class_0','bert_class_0']].sum(1)
32 | # result['class_1'] =result.loc[:,['cat_class_1','bert_class_0']].sum(1)
33 | # result['class_2'] =result.loc[:,['cat_class_2','bert_class_0']].sum(1)
34 | # result['class_3'] =result.loc[:,['cat_class_3','bert_class_0']].sum(1)
35 |
36 | result['class_0'] =result.loc[:,['lgb_class_0','cat_class_0']].sum(1)
37 | result['class_1'] =result.loc[:,['lgb_class_1','cat_class_1']].sum(1)
38 | result['class_2'] =result.loc[:,['lgb_class_2','cat_class_2']].sum(1)
39 | result['class_3'] =result.loc[:,['lgb_class_3','cat_class_3']].sum(1)
40 |
41 | result['label'] = np.argmax(result.loc[:,['class_0', 'class_1', 'class_2', 'class_3']].values,axis = 1)
42 | result = result[['sn', 'fault_time','label']]
43 | result.to_csv(os.path.join(RESULT_DIR,'stacking_result.csv'),index = False)
44 |
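45 | # The blend above is a weighted soft vote: each model's class probabilities are
46 | # scaled by its model weight and summed, and the final label is the argmax of the sums.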
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/README.md:
--------------------------------------------------------------------------------
1 | # Runner-up Solution for the 3rd Alibaba Cloud PanJiu AIOps Competition
2 |
3 | ## Official site
4 |
5 | https://tianchi.aliyun.com/competition/entrance/531947/introduction
6 |
7 | ## Project directory structure
8 | ```
9 | ├── Dockerfile
10 | ├── README.md
11 | ├── code
12 | │   ├── catboost_fs.py +++++++++++++++++++++++++++++++ CatBoost training code
13 | │   ├── generate_feature.py ++++++++++++++++++++++++++ feature generation code
14 | │   ├── generate_pseudo_label.py +++++++++++++++++++++ pseudo-label generation code
15 | │   ├── get_crashdump_venus_fea.py +++++++++++++++++++ feature generation for the new (crashdump/venus) data
16 | │   ├── lgb_fs.py ++++++++++++++++++++++++++++++++++++ LightGBM training code
17 | │   ├── log.py +++++++++++++++++++++++++++++++++++++++ logging utility
18 | │   ├── model.py +++++++++++++++++++++++++++++++++++++ CatBoost/LightGBM CV training routines
19 | │   ├── requirements.txt +++++++++++++++++++++++++++++ Python package versions
20 | │   ├── stacking.py ++++++++++++++++++++++++++++++++++ model ensembling code
21 | │   └── utils.py +++++++++++++++++++++++++++++++++++++ utility script
22 | ├── data
23 | │   ├── preliminary_a_test +++++++++++++++++++++++++++ preliminary round A test set
24 | │   ├── preliminary_b_test +++++++++++++++++++++++++++ preliminary round B test set
25 | │   └── preliminary_train ++++++++++++++++++++++++++++ training data
26 | ├── docker_push.sh +++++++++++++++++++++++++++++++++++++++++ Docker image build & push script
27 | ├── feature
28 | │   └── generation +++++++++++++++++++++++++++++++++++ generated feature files
29 | ├── log ++++++++++++++++++++++++++++++++++++++++++++++++++++ log folder
30 | │   └── catboost.log +++++++++++++++++++++++++++++++++ model training log
31 | ├── model ++++++++++++++++++++++++++++++++++++++++++++++++++ model files
32 | ├── prediction_result ++++++++++++++++++++++++++++++++++++++ prediction output folder
33 | │   ├── cat_prob_result.csv ++++++++++++++++++++++++++ CatBoost predicted probabilities
34 | │   ├── catboost_result.csv ++++++++++++++++++++++++++ CatBoost predicted labels
35 | │   └── stacking_result.csv ++++++++++++++++++++++++++ ensemble result
36 | ├── run.log ++++++++++++++++++++++++++++++++++++++++++++++++ run log
37 | ├── run.sh +++++++++++++++++++++++++++++++++++++++++++++++++ entry script
38 | ├── tcdata +++++++++++++++++++++++++++++++++++++++++++++++++ final-round test data folder (substitute the renamed preliminary-round files)
39 | │   ├── final_crashdump_dataset_b.csv ++++++++++++++++ final round B crashdump data (new data)
40 | │   ├── final_sel_log_dataset_b.csv ++++++++++++++++++ final-round test log data
41 | │   ├── final_submit_dataset_b.csv +++++++++++++++++++ final-round test IDs
42 | │   └── final_venus_dataset_b.csv ++++++++++++++++++++ final round B venus data (new data)
43 | ├── user_data
44 | │   └── tmp_data +++++++++++++++++++++++++++++++++++++ temporary files
45 | └── 答辩PPT
46 |     └── 悦智AI实验室_20220525.pdf
47 | ```
48 | ## Runtime environment
49 | Python 3.8; the Python package versions are listed in requirements.txt and can be installed with:
50 | ```
51 | pip install -r code/requirements.txt
52 | ```
53 |
54 | ## Build the image and run the code
55 | ### Build the image
56 | ```
57 | docker build -t [your registry address]:[TAG] .
58 | ```
59 | ### Run the image
60 | ```
61 | docker run [your image ID] sh run.sh
62 | ```
63 | ### Push the image
64 | ```
65 | docker push [your registry address]:[TAG]
66 | ```
67 | ### Build, run & push the image
68 | ```
69 | bash docker_push.sh
70 | ```
71 |
72 | ## Run the code
73 | ```
74 | bash run.sh
75 | ```
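76 |
77 | ## Pipeline at a glance
78 | As run.sh shows, the submitted pipeline runs in this order (paths relative to the project root):
79 | ```
80 | python3 code/get_crashdump_venus_fea.py   # build features from the crashdump/venus data
81 | python3 code/catboost_fs.py               # feature building + CatBoost training/prediction
82 | zip -j result.zip prediction_result/catboost_result.csv
83 | ```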
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/generate_pseudo_label.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import os
4 | from utils import TRAIN_DIR ,TEST_A_DIR,TEST_B_DIR,RESULT_DIR,DATA_DIR
5 |
6 | log_dataset_a = pd.read_csv(os.path.join(DATA_DIR,'preliminary_a_test/preliminary_sel_log_dataset_a.csv'))
7 | log_dataset_b = pd.read_csv(os.path.join(DATA_DIR,'preliminary_b_test/preliminary_sel_log_dataset_b.csv'))
8 | submit_dataset_a = pd.read_csv(os.path.join(DATA_DIR,'preliminary_a_test/preliminary_submit_dataset_a.csv'))
9 | submit_dataset_b = pd.read_csv(os.path.join(DATA_DIR,'preliminary_b_test/preliminary_submit_dataset_b.csv'))
10 |
11 | log_dataset_c = pd.concat([log_dataset_a,log_dataset_b],ignore_index = True,axis = 0)
12 | submit_dataset_c = pd.concat([submit_dataset_a,submit_dataset_b],ignore_index = True,axis = 0)
13 |
14 | log_dataset_c.to_csv(os.path.join(TEST_A_DIR,'final_sel_log_dataset_c.csv'),index =False)
15 | submit_dataset_c.to_csv(os.path.join(TEST_A_DIR,'final_submit_dataset_c.csv'),index =False)
16 |
17 |
18 | #
19 | # cat_prob = pd.read_csv(os.path.join(RESULT_DIR,'../../../TianchiAIOps_bert_model/cat_prob_result.csv'))
20 | # lgb_prob = pd.read_csv(os.path.join(RESULT_DIR,'../../../TianchiAIOps_bert_model/lgb_prob_result.csv'))
21 |
22 | cat_prob = pd.read_csv(os.path.join(RESULT_DIR,'B_prob_7511.csv'))
23 | lgb_prob = pd.read_csv(os.path.join(RESULT_DIR,'baseline_prob_7495.csv'))
24 | cat_prob.columns = ['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3', 'label', 'sn',
25 | 'fault_time']
26 | lgb_prob.columns = ['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3', 'label', 'sn',
27 | 'fault_time']
28 |
29 | lgb_prob = lgb_prob[lgb_prob['label'].isnull()]
30 | cat_prob = cat_prob[cat_prob['label'].isnull()]
31 |
32 | cat_prob['cat_prob'] = cat_prob.loc[:,['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3']].max(1)
33 | cat_prob['cat_label'] = np.argmax(cat_prob.loc[:,['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3']].values,axis = 1)
34 |
35 | lgb_prob['lgb_prob'] = lgb_prob.loc[:,['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3']].max(1)
36 | lgb_prob['lgb_label'] = np.argmax(lgb_prob.loc[:,['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3']].values,axis = 1)
37 |
38 | lgb_prob = lgb_prob[['sn','fault_time','lgb_label','lgb_prob']]
39 | cat_prob = cat_prob[['sn','fault_time','cat_label','cat_prob']]
40 |
41 | # prob = cat_prob.merge(lgb_prob,on =['sn','fault_time'],
42 | # how = 'left' )
43 |
44 | prob = pd.concat([cat_prob,lgb_prob],ignore_index = True)
45 | prob['cat_prob']=prob['cat_prob'].fillna(1)
46 | prob['lgb_prob']=prob['lgb_prob'].fillna(1)
47 | prob.loc[prob['cat_label'].isnull(),'cat_label'] = prob.loc[prob['cat_label'].isnull(),'lgb_label']
48 | prob.loc[prob['lgb_label'].isnull(),'lgb_label'] = prob.loc[prob['lgb_label'].isnull(),'cat_label']
49 |
50 |
51 | pseudo_labels = prob.query('cat_prob >0.85 and lgb_prob >0.85 and lgb_label == cat_label ')
52 |
53 | pseudo_labels = pseudo_labels[['sn','fault_time','cat_label']].rename(columns = {'cat_label':'label'}).reset_index(drop = True)
54 | pseudo_labels.to_csv(os.path.join(TRAIN_DIR,'pseudo_labels.csv'),index= False)
55 | print(f'生成伪标签的数据维度:{pseudo_labels.shape}')
56 |
57 | pseudo_sel_log_dataset = pd.read_csv(os.path.join(TEST_A_DIR,'final_sel_log_dataset_c.csv'))
58 | pseudo_sel_log_dataset = pseudo_sel_log_dataset[pseudo_sel_log_dataset['sn'].isin(pseudo_labels['sn'].to_list())]
59 | pseudo_sel_log_dataset.to_csv(os.path.join(TRAIN_DIR,'pseudo_sel_log_dataset.csv'),index = False)
60 | print(f'生成伪标签的日志数据维度:{pseudo_sel_log_dataset.shape}')
61 |
62 | # 制作新的测试集
63 | final_submit_dataset_d= prob.merge(pseudo_labels,on =['sn','fault_time'],how = 'left' )
64 | final_submit_dataset_d = final_submit_dataset_d[final_submit_dataset_d['label'].isnull()][['sn','fault_time' ]].reset_index(drop = True)
65 | final_submit_dataset_d.to_csv(os.path.join(TEST_A_DIR,'final_submit_dataset_d.csv'),index= False)
66 | print(f'生成新的测试集维度:{final_submit_dataset_d.shape}')
67 |
68 | final_sel_log_dataset_d = pd.read_csv(os.path.join(TEST_A_DIR,'final_sel_log_dataset_c.csv'))
69 | final_sel_log_dataset_d = final_sel_log_dataset_d[final_sel_log_dataset_d['sn'].isin(final_submit_dataset_d['sn'].to_list())]
70 |
71 | final_sel_log_dataset_d.to_csv(
72 | os.path.join(TEST_A_DIR,'final_sel_log_dataset_d.csv'),index = False)
73 | print(f'生成新的测试集日志数据维度:{final_sel_log_dataset_d.shape}')
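74 |
75 | # Recap of the rule above: a test sample is adopted as pseudo-labeled training data
76 | # only when both models predict the same class with probability > 0.85 (the query above).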
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/log/catboost.log:
--------------------------------------------------------------------------------
1 | use_less_col:335
2 | 使用的特征维度: 1762
3 | ********************** RUN CATBOOST MODEL **********************
4 | ****************** 当前的 SEED 42 **********************
5 | FOLD 1 IS RUNNING...
6 | 0: learn: 1.2939006 test: 1.2943096 best: 1.2943096 (0) total: 135ms remaining: 22m 27s
7 | 800: learn: 0.2057676 test: 0.2778188 best: 0.2778188 (800) total: 1m 2s remaining: 11m 55s
8 | 1600: learn: 0.1533318 test: 0.2698555 best: 0.2698522 (1599) total: 2m 4s remaining: 10m 54s
9 | Stopped by overfitting detector (100 iterations wait)
10 |
11 | bestTest = 0.2677497192
12 | bestIteration = 2222
13 |
14 | Shrink model to first 2223 iterations.
15 | {'learn': {'MultiClass': 0.12163532058790176}, 'validation': {'MultiClass': 0.26774971916097773}}
16 | FOLD 2 IS RUNNING...
17 | 0: learn: 1.2947765 test: 1.2944610 best: 1.2944610 (0) total: 81.8ms remaining: 13m 38s
18 | 800: learn: 0.2009925 test: 0.2969940 best: 0.2969940 (800) total: 1m 2s remaining: 11m 53s
19 | Stopped by overfitting detector (100 iterations wait)
20 |
21 | bestTest = 0.2898436422
22 | bestIteration = 1413
23 |
24 | Shrink model to first 1414 iterations.
25 | {'learn': {'MultiClass': 0.15671545706553627}, 'validation': {'MultiClass': 0.2898436422052235}}
26 | FOLD 3 IS RUNNING...
27 | 0: learn: 1.2956904 test: 1.2979653 best: 1.2979653 (0) total: 83.6ms remaining: 13m 55s
28 | 800: learn: 0.2010365 test: 0.3031897 best: 0.3031249 (796) total: 1m 2s remaining: 11m 56s
29 | 1600: learn: 0.1521093 test: 0.2952955 best: 0.2952927 (1598) total: 2m 4s remaining: 10m 54s
30 | Stopped by overfitting detector (100 iterations wait)
31 |
32 | bestTest = 0.2948664255
33 | bestIteration = 1799
34 |
35 | Shrink model to first 1800 iterations.
36 | {'learn': {'MultiClass': 0.13764700334845772}, 'validation': {'MultiClass': 0.2948664254808659}}
37 | FOLD 4 IS RUNNING...
38 | 0: learn: 1.2944941 test: 1.2931731 best: 1.2931731 (0) total: 83.8ms remaining: 13m 58s
39 | 800: learn: 0.2055831 test: 0.2798750 best: 0.2798750 (800) total: 1m 2s remaining: 11m 54s
40 | 1600: learn: 0.1555797 test: 0.2733073 best: 0.2732265 (1590) total: 2m 4s remaining: 10m 54s
41 | Stopped by overfitting detector (100 iterations wait)
42 |
43 | bestTest = 0.2729804824
44 | bestIteration = 1672
45 |
46 | Shrink model to first 1673 iterations.
47 | {'learn': {'MultiClass': 0.14819996336927216}, 'validation': {'MultiClass': 0.27298048242230794}}
48 | FOLD 5 IS RUNNING...
49 | 0: learn: 1.2909100 test: 1.2914652 best: 1.2914652 (0) total: 86.9ms remaining: 14m 29s
50 | 800: learn: 0.2014462 test: 0.2983963 best: 0.2983963 (800) total: 1m 2s remaining: 11m 55s
51 | 1600: learn: 0.1523926 test: 0.2909189 best: 0.2907775 (1582) total: 2m 4s remaining: 10m 54s
52 | Stopped by overfitting detector (100 iterations wait)
53 |
54 | bestTest = 0.2898741689
55 | bestIteration = 1887
56 |
57 | Shrink model to first 1888 iterations.
58 | {'learn': {'MultiClass': 0.13391467495348316}, 'validation': {'MultiClass': 0.289874168865446}}
59 |
60 | OOF-MEAN-ERROR score:0.283063, OOF-STD:0.010657
61 | Inint Score: 0.7240031522090993
62 | round: 1
63 | class:0, new_weight:1.01, f1 score: 0.7242893038330873
64 | class:0, new_weight:1.02, f1 score: 0.7244468658037289
65 | class:0, new_weight:1.03, f1 score: 0.7247189260435818
66 | class:0, new_weight:1.05, f1 score: 0.7247883133652404
67 | class:0, new_weight:1.06, f1 score: 0.7253074441711662
68 | class:0, new_weight:1.07, f1 score: 0.7255838308898628
69 | class:0, new_weight:1.09, f1 score: 0.7258591461588992
70 | class:0, new_weight:1.1, f1 score: 0.7263732069942956
71 | class:0, new_weight:1.11, f1 score: 0.7269810148203093
72 | class:0, new_weight:1.12, f1 score: 0.727085092104794
73 | class:0, new_weight:1.19, f1 score: 0.7275673332111965
74 | class:0, new_weight:1.2, f1 score: 0.7277300984468054
75 | class:0, new_weight:1.21, f1 score: 0.7300337938032027
76 | class:0, new_weight:1.22, f1 score: 0.7302916982856817
77 | class:0, new_weight:1.32, f1 score: 0.7302972834627351
78 | class:0, new_weight:1.33, f1 score: 0.7305212560605624
79 | class:0, new_weight:1.34, f1 score: 0.7307742905548762
80 | class:0, new_weight:1.3800000000000001, f1 score: 0.731115696618317
81 | class:0, new_weight:1.3900000000000001, f1 score: 0.7311341774607671
82 | class:0, new_weight:1.4000000000000001, f1 score: 0.7321211157346706
83 | class:0, new_weight:1.41, f1 score: 0.732530278451288
84 | class:0, new_weight:1.42, f1 score: 0.7326514907204666
85 | class:0, new_weight:1.43, f1 score: 0.7326655042252155
86 | class:0, new_weight:1.44, f1 score: 0.7340465325949609
87 | class:0, new_weight:1.45, f1 score: 0.7349701799847135
88 | class:2, new_weight:0.47000000000000003, f1 score: 0.7351355277520346
89 | class:2, new_weight:0.51, f1 score: 0.7352366908078052
90 | class:2, new_weight:0.52, f1 score: 0.7354704485017871
91 | class:2, new_weight:0.53, f1 score: 0.7356003615547112
92 | class:2, new_weight:0.54, f1 score: 0.7358162977063339
93 | class:2, new_weight:0.55, f1 score: 0.7360528528073605
94 | class:2, new_weight:0.6, f1 score: 0.7360930396635706
95 | class:2, new_weight:0.62, f1 score: 0.7361315695490319
96 | class:3, new_weight:0.77, f1 score: 0.736236509770795
97 | class:3, new_weight:0.79, f1 score: 0.7362861930960579
98 | class:3, new_weight:0.8, f1 score: 0.73637330491084
99 | class:3, new_weight:0.81, f1 score: 0.7364039172775363
100 | class:3, new_weight:0.8200000000000001, f1 score: 0.7365143106346561
101 | class:3, new_weight:0.8300000000000001, f1 score: 0.7366247783303799
102 | round: 2
103 | class:2, new_weight:0.55, f1 score: 0.7366811046538598
104 | round: 3
105 | ********************** SEARCH BEST WEIGHT : [1.45, 1.0, 0.55, 0.8300000000000001] **********************
106 | ********************** BEST MACRO_F1 : 0.7366811046538598 **********************
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/model.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | import datetime
3 | import lightgbm as lgb
4 | import numpy as np
5 | import pandas as pd
6 | from catboost import CatBoostClassifier
7 | from sklearn.model_selection import StratifiedKFold
8 |
9 | from utils import N_ROUNDS
10 | import pickle
11 | import os
12 | warnings.filterwarnings('ignore')
13 |
14 |
15 | def get_model_feature_importances(model):
16 | feature_importances = pd.DataFrame()
17 | feature_importances['fea'] = model.feature_names_
18 | feature_importances['importances'] = model.feature_importances_
19 | feature_importances = feature_importances.sort_values('importances', ascending=False).reset_index(drop=True)
20 |
21 | return feature_importances
22 |
23 |
24 | def run_cbt(train, target, test, k, seed, NUM_CLASS=4, cat_cols=[]):
25 | print('********************** RUN CATBOOST MODEL **********************')
26 | print(f'****************** 当前的 SEED {seed} ********************** ')
27 | folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
28 | oof_prob = np.zeros((train.shape[0], NUM_CLASS))
29 | test_prob = np.zeros((test.shape[0], NUM_CLASS))
30 | feature_importance_df = []
31 | offline_score = []
32 | model_list = []
33 |
34 | ## K-Fold
35 | for fold, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
36 | print("FOLD {} IS RUNNING...".format(fold + 1))
37 | trn_x, trn_y = train.loc[trn_idx], target.loc[trn_idx]
38 | val_x, val_y = train.loc[val_idx], target.loc[val_idx]
39 | catboost_model = CatBoostClassifier(
40 | iterations=N_ROUNDS,
41 | od_type='Iter',
42 | od_wait=120,
43 | max_depth=8,
44 | learning_rate=0.05,
45 | l2_leaf_reg=9,
46 | random_seed=seed,
47 | fold_len_multiplier=1.1,
48 | loss_function='MultiClass',
49 | logging_level='Verbose',
50 | # task_type="GPU"
51 |
52 | )
53 |
54 | start_time = datetime.datetime.now()
55 |
56 | catboost_model.fit(trn_x,
57 | trn_y,
58 | eval_set=(val_x, val_y),
59 | use_best_model=True,
60 | verbose=800,
61 | early_stopping_rounds=100,
62 | cat_features=cat_cols,
63 | )
64 | end_time = datetime.datetime.now()
65 | model_train_cost_time = end_time - start_time
66 | print('****************** 模型训练 COST TIME : ',str(model_train_cost_time),' ******************')
67 |
68 | start_time = datetime.datetime.now()
69 | oof_prob[val_idx] = catboost_model.predict_proba(train.loc[val_idx])
70 | end_time = datetime.datetime.now()
71 | model_pred_cost_time = end_time - start_time
72 | print('****************** 模型预测 COST TIME : ', str(model_pred_cost_time), ' ******************')
73 | # catboost_model = catboost_model.get_best_iteration()
74 | test_prob += catboost_model.predict_proba(test) / folds.n_splits
75 | print(catboost_model.get_best_score())
76 | offline_score.append(catboost_model.get_best_score()['validation']['MultiClass'])
77 |
78 | feature_importance_df.append(get_model_feature_importances(catboost_model))
79 | model_list.append(catboost_model)
80 | with open(os.path.join('../model', f'cat_model_flod_{fold}.pkl'), 'wb') as f:
81 | pickle.dump(catboost_model, f)
82 | print('\nOOF-MEAN-ERROR score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
83 | fea_imp_df = pd.concat(feature_importance_df, ignore_index=True).groupby('fea').agg(
84 | {'importances': 'mean'}).reset_index().sort_values('importances', ascending=False).reset_index(drop=True)
85 |
86 | return oof_prob, test_prob, fea_imp_df, model_list
87 |
88 |
89 | def run_lgb(train, target, test, k, seed=42, NUM_CLASS=4, cat_cols=[]):
90 | # feats = [f for f in train.columns if f not in ['cust_no', 'label', 'I7', 'I9', 'B6']]
91 | # print('Current num of features:', len(feats))
92 | print(f'********************** RUN LGBM MODEL **********************')
93 | print(f'****************** 当前的 SEED {seed} ********************** ')
94 | cols_map = {j: i for i, j in enumerate(train.columns)}
95 | cat_cols = [cols_map[i] for i in cat_cols]
96 | train = train.rename(columns=cols_map)
97 | test = test.rename(columns=cols_map)
98 | folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
99 | oof_prob = np.zeros((train.shape[0], NUM_CLASS))
100 | test_prob = np.zeros((test.shape[0], NUM_CLASS))
101 | fea_imp_df_list = []
102 | offline_score = []
103 | model_list = []
104 | ## K-Fold
105 | for fold, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
106 | params = {
107 | "objective": "multiclass",
108 | "num_class": NUM_CLASS,
109 | "learning_rate": 0.01,
110 | "max_depth": -1,
111 | "num_leaves": 32,
112 | "verbose": -1,
113 | "bagging_fraction": 0.8,
114 | "feature_fraction": 0.8,
115 | "seed": seed,
116 | 'metric': 'multi_error'
117 |
118 | }
119 | print("FOLD {} IS RUNNING...".format(fold + 1))
120 | trn_data = lgb.Dataset(train.loc[trn_idx], label=target.loc[trn_idx])
121 | val_data = lgb.Dataset(train.loc[val_idx], label=target.loc[val_idx])
122 |
123 | # train
124 | params['seed'] = seed
125 | lgb_model = lgb.train(
126 | params,
127 | trn_data,
128 | num_boost_round=N_ROUNDS,
129 | valid_sets=[trn_data, val_data],
130 | early_stopping_rounds=100,
131 | verbose_eval=200,
132 | categorical_feature=cat_cols,
133 |
134 | )
135 | # predict
136 | oof_prob[val_idx] = lgb_model.predict(train.loc[val_idx], num_iteration=lgb_model.best_iteration)
137 | test_prob += lgb_model.predict(test, num_iteration=lgb_model.best_iteration) / folds.n_splits
138 | offline_score.append(lgb_model.best_score['valid_1']['multi_error'])
139 | fea_imp = pd.DataFrame()
140 | fea_imp['feature_name'] = lgb_model.feature_name()
141 | fea_imp['importance'] = lgb_model.feature_importance()
142 | fea_imp['feature_name'] = fea_imp['feature_name'].map({str(cols_map[i]): i for i in cols_map})
143 | fea_imp = fea_imp.sort_values('importance', ascending=False)
144 | fea_imp_df_list.append(fea_imp)
145 |
146 | model_list.append(lgb_model)
147 | print('\nOOF-MEAN-ERROR score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
148 | fea_imp_df = pd.concat(fea_imp_df_list, ignore_index=True).groupby('feature_name').agg(
149 | {'importance': 'mean'}).reset_index().sort_values('importance', ascending=False)
150 | return oof_prob, test_prob, fea_imp_df, model_list
151 |
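152 | # Usage sketch (as called from catboost_fs.py / lgb_fs.py; `train`/`test` are feature
153 | # DataFrames and `target` the label column):
154 | #   oof_prob, test_prob, fea_imp_df, model_list = run_cbt(train, target, test, k=5, seed=42)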
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from log import Logger
4 | from collections import Counter
5 | from tqdm import tqdm
6 | import numpy as np
7 | import pandas as pd
8 |
9 | ROOT_DIR = os.path.join(sys.path[0], '../')
10 | LOG_DIR = os.path.join(ROOT_DIR, 'log')
11 |
12 | DATA_DIR = os.path.join(ROOT_DIR, 'data')
13 | TRAIN_DIR = os.path.join(DATA_DIR, 'preliminary_train')
14 | # Switch these paths as needed when submitting the Docker image
15 | MODEL_PATH = os.path.join(ROOT_DIR, './model/deberta-base')
16 | MODEL_1_PATH = os.path.join(ROOT_DIR, './model')
17 | TEST_A_DIR = os.path.join(ROOT_DIR, './tcdata')
18 | # TEST_A_DIR = os.path.join(ROOT_DIR, './tcdata_test')
19 | PSEUDO_FALG = True
20 | TEST_B_DIR = os.path.join(ROOT_DIR, 'tcdata')
21 |
22 |
23 |
24 | RESULT_DIR = os.path.join(ROOT_DIR, 'prediction_result')
25 |
26 | FEATURE_DIR = os.path.join(ROOT_DIR, 'feature')
27 | GENERATION_DIR = os.path.join(FEATURE_DIR, 'generation')
28 | CORRELATION_DIR = os.path.join(FEATURE_DIR, 'correlation')
29 |
30 |
31 | USER_DATA_DIR = os.path.join(ROOT_DIR, 'user_data')
32 | USER_MODEL_DIR = os.path.join(USER_DATA_DIR, 'model_data')
33 | TMP_DIR = os.path.join(USER_DATA_DIR, 'tmp_data')
34 | N_ROUNDS = 10000
35 | TIME_INTERVAL = 60
36 |
37 | KEY_1 = ['OEM record c2', 'Processor CPU_Core_Error', '001c4c', 'System Event Sys_Event', 'Power Supply PS0_Status',
38 | 'Temperature CPU0_Margin_Temp', 'Reading 51 > Threshold 85 degrees C', 'Lower Non-critical going low',
39 | 'Temperature CPU1_Margin_Temp', 'System ACPI Power State #0x7d', 'Lower Critical going low']
40 | KEY_2 = ['OEM CPU0 MCERR', 'OEM CPU0 CATERR', 'Reading 0 < Threshold 2 degrees C', '0203c0a80101',
41 | 'Unknown CPU0 MCERR', 'Unknown CPU0 CATERR', 'Microcontroller #0x3b', 'System Boot Initiated',
42 | 'Processor #0xfa', 'Power Unit Pwr Unit Status', 'Hard reset', 'Power off/down', 'System Event #0xff',
43 | 'Memory CPU1A1_DIMM_Stat', '000000', 'Power cycle', 'OEM record c3', 'Memory CPU1C0_DIMM_Stat',
44 | 'Reading 0 < Threshold 1 degrees C', 'IERR']
45 | KEY_3 = ['Memory', 'Correctable ECC logging limit reached', 'Memory MEM_CHE0_Status', 'Memory Memory_Status',
46 | 'Memory #0x87', 'Memory CPU0F0_DIMM_Stat', 'Memory Device Disabled', 'Memory #0xe2',
47 | 'OS Stop/Shutdown OS Status', 'System Boot Initiated System Restart', 'OS Boot BIOS_Boot_Up',
48 | 'System Boot Initiated BIOS_Boot_UP', 'Memory DIMM101', 'OS graceful shutdown', 'OS Critical Stop OS Status',
49 | 'Memory #0xf9', 'Memory CPU0C0_DIMM_Stat', 'Memory DIMM111', 'Memory DIMM021', ]
50 | KEY_4 = ['Drive Fault', 'NMI/Diag Interrupt', 'Failure detected', 'Power Supply AC lost', 'Power Supply PSU0_Supply',
51 | 'AC out-of-range, but present', 'Predictive failure', 'Drive Present', 'Temperature Temp_DIMM_KLM',
52 | 'Temperature Temp_DIMM_DEF', 'Power Supply PS1_Status', 'Identify Status', 'Power Supply PS2_Status',
53 | 'Temperature DIMMG1_Temp', 'Upper Non-critical going high', 'Temperature DIMMG0_Temp',
54 | 'Upper Critical going high', 'Power Button pressed', 'System Boot Initiated #0xb8', 'Deasserted']
55 | TOP_KEY_WORDS = ['0203c0a80101', 'Configuration Error', 'Correctable ECC', 'Deasserted', 'Device Enabled', 'Drive Present',
56 | 'Event Logging Disabled SEL', 'Failure detected', 'IERR', 'Initiated by hard reset', 'Initiated by power up',
57 | 'Initiated by warm reset', 'Log area reset/cleared', 'Memory', 'Memory #0xe2', 'Memory CPU0C0',
58 | 'Microcontroller/Coprocessor BMC', 'OEM CPU0 CATERR', 'OEM CPU0 MCERR', 'OS Boot BIOS',
59 | 'OS Critical Stop OS Status', 'Power Supply PS1', 'Power Supply PS2', 'Presence detected', 'Processor', 'Processor CPU', 'Processor CPU0',
60 | 'Processor CPU1', 'S0/G0: working', 'S4/S5: soft-off', 'Slot / Connector PCIE', 'State Asserted', 'State Deasserted',
61 | 'System ACPI Power State ACPI', 'System Boot Initiated', 'System Boot Initiated #0xe0', 'System Boot Initiated BIOS',
62 | 'System Event', 'System Event #0x10', 'System Event #0xff', 'Timestamp Clock Sync', 'Transition to Running', 'Uncorrectable ECC',
63 | 'Uncorrectable machine check exception', 'Unknown CPU0 CATERR', 'Unknown CPU0 MCERR', 'Unknown Chassis', 'Watchdog2 IPMI',
64 | ]
65 | TOP_KEY_WORDS_2 = ['Processor CPU0 Status', 'System Boot Initiated BIOS Boot Up', 'Uncorrectable ECC', 'Initiated by power up',
66 | 'Configuration Error', 'Processor CPU CATERR', 'Processor CPU1 Status', 'Memory #0xe2', 'IERR', 'Initiated by warm reset',
67 | 'State Asserted', 'S4/S5: soft-off', 'Memory #0xf9', 'S0/G0: working', 'boot completed - device not specified', 'Timestamp Clock Sync',
68 | 'Presence detected', 'System Boot Initiated #0xe0', 'Drive Fault', 'Power Supply PS1 Status', 'Power off/down', 'OS Boot #0xe9',
69 | 'Failure detected', 'Uncorrectable machine check exception', 'Transition to Running', 'Power Supply PS2 Status',
70 | 'Memory Device Disabled', 'System Restart', 'System Event #0x10', 'Sensor access degraded or unavailable', 'Unknown #0x17',
71 | 'Drive Present', 'Management Subsys Health System Health', 'Power Supply AC lost', 'Microcontroller #0x16']
72 | CHARATERS = ['#', '&', ]
73 | # KEY_WORDS = KEY_1 + KEY_2 + KEY_3 + KEY_4 + CHARATERS
74 | KEY_WORDS = KEY_1 + KEY_2 + KEY_3 + KEY_4 + CHARATERS + TOP_KEY_WORDS
75 | KEY_WORDS = list(set(KEY_WORDS))
76 | # cnt_1_0_diff_key_words = ['State Asserted','Processor CPU_CATERR','Unknown #0x17','Microcontroller #0x16','Transition to Running','State Deasserted','Processor #0xfa','Temperature CPU1_Margin_Temp','Temperature CPU0_Margin_Temp','Power cycle','Management Subsys Health System_Health','Sensor access degraded or unavailable','Power off/down','System ACPI Power State #0x7d']
77 | # key_words_0 = ['Temperature CPU0_Margin_Temp','Lower Critical going low','System ACPI Power State #0x7d','Temperature CPU1_Margin_Temp','Lower Non-critical going low','Uncorrectable machine check exception','Reading 0 < Threshold 1 degrees C','000000','Unknown #0x19','Temperature DIMMG1_Temp','Reading 0 < Threshold 0 degrees C','001c4c','IERR','Upper Critical going high','Unknown Chassis_control','Temperature DIMMG0_Temp','Upper Non-critical going high','Temperature Temp_DIMM_DEF','Power cycle','Processor CPU0_Status','Temperature Temp_DIMM_KLM','Processor CPU1_Status','Management Subsys Health System_Health']
78 | # key_words_1 = ['Processor #0xfa','State Deasserted','Power off/down','Power cycle','IERR','Unknown #0x17','Management Subsys Health System_Health','Processor CPU_CATERR','Reading 0 < Threshold 1 degrees C','','Sensor access degraded or unavailable','Transition to Running','State Asserted','Microcontroller #0x16','Processor CPU0_Status','Processor CPU1_Status','Slot / Connector PCIE_Status','Fault Status','System ACPI Power State ACPI_PWR_Status','Management Subsystem Health System_Health','Configuration Error','Uncorrectable machine check exception','Timestamp Clock Sync']
79 | # key_words_2 = ['Memory #0xe2','Memory Device Disabled','Memory #0x87','Memory #0xf9','Correctable ECC','Memory CPU0D0_DIMM_Stat','Uncorrectable ECC','Memory CPU1B0_DIMM_Stat','System Boot Initiated BIOS_Boot_UP','System Restart','Presence Detected','Temperature CPU0_Temp','boot completed - device not specified','Log almost full','Device Present','Legacy OFF state','System Boot Initiated #0xe0','System Event #0x10','Legacy ON state','OS Boot #0xe0','Unknown #0xc5','System Boot Initiated #0xb8','Event Logging Disabled SEL_Status']
80 | # key_words_3 = ['Drive Fault','Failure detected','Drive Present','Temperature Temp_DIMM_KLM','Temperature Temp_DIMM_DEF','Power Supply PS4_Status','Upper Non-critical going high','Temperature DIMMG0_Temp','Temperature DIMMG1_Temp','Power Supply PS3_Status','Upper Critical going high','Predictive failure','Power Supply AC lost','Unknown #0x19','Power Unit Power Unit','AC out-of-range, but present','Power Supply PS1_Status','Power Supply PS2_Status','Log area reset/cleared','Microcontroller/Coprocessor BMC_Boot_Up','System Boot Initiated #0xb8','Power Button pressed','Device Present']
81 | # top_key_words = [ 'Configuration Error','Uncorrectable ECC','Processor CPU0_Status','Initiated by power up','','Presence Detected','Processor CPU1_Status','S0/G0: working','Processor CPU_CATERR','Presence detected','S4/S5: soft-off','Upper Critical going high','Memory #0xe2','IERR','Initiated by warm reset','State Asserted','Upper Non-critical going high','boot completed - device not specified','Memory Device Disabled','Timestamp Clock Sync','Lower Critical going low','Transition to Running','Memory #0xf9','Power Supply PS1_Status']
82 | # key_words_1_desc = ['#0xfa', '#0x','#0xff','CATERR','cycle','Unit','IERR','IPMI','#0x17', 'Running','#0x7c','Unknown','CPU', 'Sensor','CPU0','CPU1','Subsys']
83 | #
84 | # key_words = cnt_1_0_diff_key_words +key_words_0+key_words_1+key_words_2+key_words_3+top_key_words+key_words_1_desc
85 | # key_words = list(set(key_words))
86 | # KEY_WORDS = key_words+CHARATERS
87 |
88 |
89 | def create_dir(dir):
90 | """
91 |     Create a directory if it does not already exist.
92 |     :param dir: directory name
93 | :return:
94 | """
95 | if not os.path.exists(dir):
96 | os.mkdir(dir)
97 | print(f'{dir}目录不存在,创建{dir}目录成功.')
98 | else:
99 | print(f'{dir}目录已存在.')
100 |
101 |
102 | def create_all_dir():
103 | """
104 |     Create all required directories.
105 | :return:
106 | """
107 | create_dir(ROOT_DIR)
108 | create_dir(LOG_DIR)
109 |
110 | # create_dir(MODEL_DIR)
111 | create_dir(RESULT_DIR)
112 |
113 | create_dir(FEATURE_DIR)
114 | create_dir(GENERATION_DIR)
115 | create_dir(CORRELATION_DIR)
116 |
117 | create_dir(DATA_DIR)
118 | create_dir(TRAIN_DIR)
119 | create_dir(TEST_A_DIR)
120 | # create_dir(TEST_B_DIR)
121 |
122 | create_dir(USER_DATA_DIR)
123 | create_dir(USER_MODEL_DIR)
124 | create_dir(TMP_DIR)
125 |
126 |
127 | def clean_str(string):
128 | return string
129 |
130 |
131 | def my_tokenizer(s):
132 | return s.split(' | ')
133 |
134 |
135 | def get_word_counter(data):
136 | print('获取异常日志计数字典')
137 |
138 | counter = Counter()
139 | for string_ in tqdm(data['msg']):
140 | string_ = string_.strip()
141 | counter.update(my_tokenizer(clean_str(string_)))
142 | return counter
143 |
144 |
145 | def macro_f1(target_df: pd.DataFrame, submit_df: pd.DataFrame):
146 | """
147 |     Compute the weighted macro-F1 score.
148 | :param target_df: [sn,fault_time,label]
149 | :param submit_df: [sn,fault_time,label]
150 | :return:
151 | """
152 |
153 | weights = [5 / 11, 4 / 11, 1 / 11, 1 / 11]
154 |
155 | # weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]
156 | overall_df = target_df.merge(
157 | submit_df, how='left', on=[
158 | 'sn', 'fault_time'], suffixes=[
159 | '_gt', '_pr'])
160 |     overall_df = overall_df.fillna(-1)  # unmatched rows get label -1
161 | macro_F1 = 0.
162 | for i in range(len(weights)):
163 | TP = len(overall_df[(overall_df['label_gt'] == i)
164 | & (overall_df['label_pr'] == i)])
165 | FP = len(overall_df[(overall_df['label_gt'] != i)
166 | & (overall_df['label_pr'] == i)])
167 | FN = len(overall_df[(overall_df['label_gt'] == i)
168 | & (overall_df['label_pr'] != i)])
169 | precision = TP / (TP + FP) if (TP + FP) > 0 else 0
170 | recall = TP / (TP + FN) if (TP + FN) > 0 else 0
171 | F1 = 2 * precision * recall / \
172 | (precision + recall) if (precision + recall) > 0 else 0
173 | macro_F1 += weights[i] * F1
174 | return macro_F1
175 |
176 |
177 | def search_weight(train, valid_y, raw_prob, init_weight=[
178 | 1.0], class_num=4, step=0.001):
179 | weight = init_weight.copy() * class_num
180 | oof = train[['sn', 'fault_time']]
181 | oof['label'] = raw_prob.argmax(axis=1)
182 | f_best = macro_f1(train[['sn', 'fault_time', 'label']], oof)
183 | print("Inint Score:", f_best)
184 |
185 | # f_best = f1_score(y_true=valid_y, y_pred=raw_prob.argmax(axis=1),average='macro')
186 | flag_score = 0
187 | round_num = 1
188 | while (flag_score != f_best):
189 | print("round: ", round_num)
190 | round_num += 1
191 | flag_score = f_best
192 | for c in range(class_num):
193 | for n_w in range(0, 2000, 10):
194 | num = n_w * step
195 | new_weight = weight.copy()
196 | new_weight[c] = num
197 | prob_df = raw_prob.copy()
198 | prob_df = prob_df * np.array(new_weight)
199 |
200 | oof['label'] = prob_df.argmax(axis=1)
201 | f = macro_f1(train[['sn', 'fault_time', 'label']], oof)
202 | # f = f1_score(y_true=valid_y, y_pred=prob_df.argmax(axis=1),average='macro')
203 | if f > f_best:
204 | weight = new_weight.copy()
205 | f_best = f
206 | print(f"class:{c}, new_weight:{num}, f1 score: {f}")
207 | print(
208 | f'********************** SEARCH BEST WEIGHT : {weight} **********************')
209 | return weight
210 |
211 |
212 | def get_new_cols(df, key=['sn', 'fault_time']):
213 | if isinstance(df.columns[0], tuple):
214 |
215 | new_cols = []
216 | for i in df.columns:
217 | if i[0] in key:
218 | new_cols.append(i[0])
219 | else:
220 | new_cols.append(f'{i[0]}_{i[1]}')
221 | df.columns = new_cols
222 | return df
223 | else:
224 | print('当前的DataFrame没有二级列名,请检查。')
225 | return df
226 |
227 |
228 | if __name__ == '__main__':
229 | # create_all_dir()
230 | logger = Logger(name=os.path.basename(__file__).split(
231 | '.py')[0], log_path=LOG_DIR, mode="w").get_log
232 | print(len(KEY_WORDS))
233 |
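234 |     # Toy check of the weighted macro-F1 metric (hypothetical rows; class 0
235 |     # carries weight 5/11, so a single exact match on class 0 scores 5/11):
236 |     gt = pd.DataFrame({'sn': ['a', 'b'], 'fault_time': ['t1', 't2'], 'label': [0, 1]})
237 |     pr = pd.DataFrame({'sn': ['a', 'b'], 'fault_time': ['t1', 't2'], 'label': [0, 2]})
238 |     print(macro_f1(gt, pr))  # -> 0.4545... (= 5/11)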
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/lgb_fs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import warnings
3 |
4 | import numpy as np
5 | import pandas as pd
6 | import datetime
7 | from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
8 | get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
9 | get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
10 | get_w2v_feats
11 | from model import run_cbt,run_lgb
12 | from utils import RESULT_DIR, TRAIN_DIR, \
13 | TEST_A_DIR, KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL,PSEUDO_FALG,GENERATION_DIR
14 |
15 | warnings.filterwarnings('ignore')
16 |
17 |
18 | def get_label(PSEUDO_FALG):
19 | preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
20 | preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
21 |
22 | if PSEUDO_FALG:
23 | print('获取伪标签LABEL')
24 | pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
25 | label = pd.concat([preliminary_train_label_dataset,
26 | pseudo_labels,
27 | preliminary_train_label_dataset_s],
28 | ignore_index=True,
29 | axis=0).sort_values(
30 | ['sn', 'fault_time']).reset_index(drop=True)
31 | else:
32 | print('不使用伪标签数据')
33 | label = pd.concat([preliminary_train_label_dataset,
34 | preliminary_train_label_dataset_s],
35 | ignore_index=True,
36 | axis=0).sort_values(
37 | ['sn', 'fault_time']).reset_index(drop=True)
38 | label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
39 | label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
40 | return label
41 |
42 |
43 | def get_log_dataset(PSEUDO_FALG):
44 | preliminary_sel_log_dataset = pd.read_csv(preliminary_sel_log_dataset_path)
45 | preliminary_sel_log_dataset_a = pd.read_csv(preliminary_sel_log_dataset_a_path)
46 | if PSEUDO_FALG:
47 | print('获取伪标签日志数据')
48 | pseudo_sel_log_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_sel_log_dataset.csv'))
49 | log_dataset = pd.concat([preliminary_sel_log_dataset,
50 | pseudo_sel_log_dataset,
51 | preliminary_sel_log_dataset_a],
52 | ignore_index=True,
53 | axis=0).sort_values(
54 | ['sn', 'time', 'server_model']).reset_index(drop=True)
55 | else:
56 | print('不使用伪标签数据')
57 | log_dataset = pd.concat([preliminary_sel_log_dataset,
58 | preliminary_sel_log_dataset_a],
59 | ignore_index=True,
60 | axis=0).sort_values(
61 | ['sn', 'time', 'server_model']).reset_index(drop=True)
62 | log_dataset['time'] = log_dataset['time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
63 |
64 | return log_dataset
65 |
66 | def get_fea_distribute(feature_df, feature_importances, dataset_type, top=30):
67 | print('根据特征重要性,获取数据集的分布情况,用于验证训练集和测试集是否分布一致')
68 | fea_distribute_list = []
69 | for i in feature_importances[:top]['fea'].to_list():
70 | fea_distribute_tmp = (feature_df[i].value_counts() / len(feature_df)).reset_index().rename(
71 | columns={'index': 'value'})
72 | fea_distribute_list.append(fea_distribute_tmp)
73 |
74 | fea_distribute = fea_distribute_list[-1]
75 | for i in fea_distribute_list[:-1]:
76 | fea_distribute = fea_distribute.merge(i, on='value', how='left')
77 | fea_distribute['value'] = fea_distribute['value'].apply(lambda x: f'{dataset_type}_{int(x)}')
78 | return fea_distribute
79 |
80 |
81 | def get_train_test(label, preliminary_submit_dataset_a, log_dataset):
82 | print('获取训练集数据与测试集数据')
83 | train = label.merge(log_dataset, on='sn', how='left')
84 | test = preliminary_submit_dataset_a.merge(log_dataset, on='sn', how='left')
85 | # train['time_interval'] = (pd.to_datetime( train['fault_time'])-train['time'] ).apply(lambda x:x.total_seconds())
86 | # test['time_interval'] = (pd.to_datetime( test['fault_time'])- test['time'] ).apply(lambda x:x.total_seconds())
87 | # train = train.query('time_interval > 0')
88 | # test = test.query('time_interval > 0')
89 | print(f'训练集维度:{train.shape},测试集维度:{test.shape}')
90 | train = train.drop_duplicates().reset_index(drop=True)
91 | test = test.drop_duplicates().reset_index(drop=True)
92 | train['time'] = pd.to_datetime(train['time'])
93 | test['time'] = pd.to_datetime(test['time'])
94 | return train, test
95 |
96 | start_time = datetime.datetime.now()
97 |
98 | additional_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'additional_sel_log_dataset.csv')
99 | preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
100 | preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
101 | preliminary_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_sel_log_dataset.csv')
102 |
103 | preliminary_submit_dataset_a_path = os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv')
104 | preliminary_sel_log_dataset_a_path = os.path.join(TEST_A_DIR, 'final_sel_log_dataset_b.csv')
105 |
106 | print(preliminary_submit_dataset_a_path, preliminary_sel_log_dataset_a_path)
107 |
108 | preliminary_submit_dataset_a = pd.read_csv(preliminary_submit_dataset_a_path)
109 | preliminary_submit_dataset_a.head()
110 |
111 | log_dataset = get_log_dataset(PSEUDO_FALG)
112 | label = get_label(PSEUDO_FALG)
113 |
114 |
115 | next_time_list = [i / TIME_INTERVAL for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600]] + [1000000]
116 |
117 | label, preliminary_submit_dataset_a = add_last_next_time4fault(label, preliminary_submit_dataset_a, TIME_INTERVAL,
118 | next_time_list)
119 | train, test = get_train_test(label, preliminary_submit_dataset_a, log_dataset)
120 | train = train.drop_duplicates(['sn', 'fault_time', 'time', 'msg', 'server_model']).reset_index(drop=True)
121 |
122 | train['time_interval'] = (pd.to_datetime(train['fault_time']) - pd.to_datetime(train['time'])).apply(
123 | lambda x: x.total_seconds())
124 | test['time_interval'] = (pd.to_datetime(test['fault_time']) - pd.to_datetime(test['time'])).apply(
125 | lambda x: x.total_seconds())
126 |
127 | all_data = pd.concat([train, test], axis=0, ignore_index=True)
128 | all_data = all_data.sort_values(['sn','server_model', 'fault_time', 'time'])
129 | w2v_feats = get_w2v_feats(all_data,
130 | f1_list = ['sn'],
131 | f2_list = ['msg_list', 'msg_0', 'msg_1', 'msg_2'])
132 |
133 | # Build the server_model time-interval statistics features
134 | server_model_time_interval_stat_fea = get_server_model_time_interval_stat_fea(all_data)
135 |
136 | msg_text_fea = get_msg_text_fea_all(all_data)
137 | # Time-gap (duration) features
138 | duration_minutes_fea = get_duration_minutes_fea(train, test)
139 |
140 | # server_model features
141 | server_model_fea = get_server_model_fea(train, test)
142 | counter = get_word_counter(train)
143 |
144 | # nearest_msg features
145 | nearest_msg_fea = get_nearest_msg_fea(train, test)
146 | # server_model beta_target features
147 | beta_target_fea = get_beta_target(train, test)
148 |
149 | key = ['sn', 'fault_time', 'label', 'server_model']
150 |
151 | fea_num = len(KEY_WORDS)
152 | time_list = [i * TIME_INTERVAL for i in next_time_list]
153 | train = get_feature(train, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'label', 'server_model'])
154 | test = get_feature(test, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'server_model'])
155 |
156 | print('添加 时间差 特征')
157 | train = train.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
158 | test = test.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
159 |
160 | print('添加 server_model特征')
161 | train = train.merge(server_model_fea, on=['sn', 'server_model'])
162 | test = test.merge(server_model_fea, on=['sn', 'server_model'])
163 |
164 | print('添加 w2v_feats')
165 | train = train.merge(w2v_feats, on=['sn' ])
166 | test = test.merge(w2v_feats, on=['sn', ])
167 |
168 | print('添加 nearest_msg 特征')
169 | train = train.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
170 | test = test.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
171 |
172 | print('添加 beta_target 特征')
173 | train = train.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
174 | test = test.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
175 |
176 | server_model_sn_fea_2 = get_server_model_sn_fea_2(train, test)
177 | print('添加 server_model_sn_fea_2 特征')
178 | train = train.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
179 | test = test.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
180 |
181 | # crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR,'crashdump_venus_fea.csv') )
182 | # print('添加 crashdump_venus_fea 特征')
183 | # print(train.shape,test.shape,crashdump_venus_fea.shape)
184 | # train = train.merge(crashdump_venus_fea, on=['sn' , 'fault_time'],how = 'left')
185 | # test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time' ],how = 'left')
186 | # print(train.shape,test.shape )
187 |
188 | crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR,'crashdump_venus_fea_v1.csv') )
189 | print('添加 crashdump_venus_fea 特征')
190 | print(train.shape,test.shape,crashdump_venus_fea.shape)
191 | train = train.merge(crashdump_venus_fea, on=['sn' , 'fault_time'],how = 'left')
192 | test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time' ],how = 'left')
193 | print(train.shape,test.shape )
194 | test.to_csv(os.path.join(GENERATION_DIR,'test.csv'),index =False)
195 | train.to_csv(os.path.join(GENERATION_DIR,'train.csv'),index =False)
196 |
197 | # print('添加 msg_text_fea 特征')
198 | # train = train.merge(msg_text_fea, on=['sn', 'fault_time' ], how='left')
199 | # test = test.merge(msg_text_fea, on=['sn', 'fault_time'], how='left')
200 |
201 | # print('添加 关键词交叉特征 ')
202 | # train,test = get_key_word_cross_fea(train,test)
203 |
204 | # print('添加 server_model_time_interval_stat_fea 特征')
205 | # train = train.merge(server_model_time_interval_stat_fea, on=['server_model' ],how ='left')
206 | # test = test.merge(server_model_time_interval_stat_fea, on=['server_model' ],how ='left')
207 |
208 |
209 | use_less_cols_1 = ['last_last_msg_cnt', 'last_first_msg_cnt','time_diff_1_min',
210 | 'last_msg_list_unique_LabelEnc', 'last_msg_0_unique_LabelEnc',
211 | 'last_msg_1_unique_LabelEnc', 'last_msg_2_unique_LabelEnc',
212 | 'last_msg_list_list_LabelEnc', 'last_msg_0_list_LabelEnc',
213 | 'last_msg_1_list_LabelEnc', 'last_msg_2_list_LabelEnc',
214 | 'last_msg_0_first_LabelEnc', 'last_msg_1_first_LabelEnc',
215 | 'last_msg_2_first_LabelEnc', 'last_msg_0_last_LabelEnc',
216 | 'last_msg_1_last_LabelEnc', 'last_msg_2_last_LabelEnc',
217 | 'last_msg_last_LabelEnc', 'last_msg_first_LabelEnc']
218 |
219 | use_less_col = [i for i in train.columns if train[i].nunique() < 2] + use_less_cols_1
220 |
221 |
222 | print(f'use_less_col:{len(use_less_col)}')
223 | use_cols = [i for i in train.columns if i not in ['sn', 'fault_time', 'label', 'server_model'] + use_less_col]
224 | cat_cols = ['server_model_LabelEnc', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc',]
225 | use_cols = sorted(use_cols)
226 | print('使用的特征维度:',len(use_cols))
227 |
228 | # cat_cols = []
229 | # for i in use_cols:
230 | # if '_LabelEnc' in i:
231 | # cat_cols.append(i)
232 |
233 | oof_prob = np.zeros((train.shape[0], 4))
234 | test_prob = np.zeros((test.shape[0], 4))
235 | # seeds = [42,4242,40424,1024,2048]
236 | seeds = [42 ]
237 | for seed in seeds:
238 |     seed_oof_prob, seed_test_prob, fea_imp_df, model_list = run_lgb(train[use_cols], train[['label']], test[use_cols], k=5,
239 |                                                                     seed=seed, cat_cols=cat_cols)
240 |     oof_prob += seed_oof_prob / len(seeds)   # accumulate the per-seed average instead of overwriting it
241 |     test_prob += seed_test_prob / len(seeds)
242 |
243 | weight = search_weight(train, train[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
244 | oof_prob = oof_prob * np.array(weight)
245 | test_prob = test_prob * np.array(weight)
246 |
247 | target_df = train[['sn', 'fault_time', 'label']]
248 | submit_df = train[['sn', 'fault_time']]
249 | submit_df['label'] = oof_prob.argmax(axis=1)
250 |
251 | score = macro_f1(target_df=target_df, submit_df=submit_df)
252 | print(f'********************** BEST MACRO_F1 : {score} **********************')
253 | score = round(score, 5)
254 |
255 | y_pred = test_prob.argmax(axis=1)
256 | result = test[['sn', 'fault_time']]
257 | result['label'] = y_pred
258 | result = preliminary_submit_dataset_a.merge(result, on=['sn', 'fault_time'], how='left')[['sn', 'fault_time', 'label']]
259 | result['label'] = result['label'].fillna(0).astype(int)
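# Submit rows with no matching prediction come out of the left merge with a NaN
# label; they default to class 0 here.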
260 |
261 | result.to_csv(os.path.join(RESULT_DIR, 'lgb_result.csv'), index=False)
262 |
263 | fea_imp_df = fea_imp_df.reset_index(drop=True)
264 | fea_imp_df.to_csv(os.path.join(RESULT_DIR, f'./lgb_fea_imp_{int(score * 100000)}.csv'), index=False)
265 |
266 | train_result_prob = pd.DataFrame(oof_prob).add_prefix('lgb_class_')
267 | test_result_prob = pd.DataFrame(test_prob).add_prefix('lgb_class_')
268 | train_result_prob['label'] = train['label']
269 | train_result_prob['sn'] = train['sn']
270 | train_result_prob['fault_time'] = train['fault_time']
271 | test_result_prob['sn'] = test['sn']
272 | test_result_prob['fault_time'] = test['fault_time']
273 |
274 | result_prob = pd.concat([train_result_prob, test_result_prob], ignore_index=True)
275 | result_prob.to_csv(os.path.join(RESULT_DIR, 'lgb_prob_result.csv'), index=False)
276 |
277 |
278 | end_time = datetime.datetime.now()
279 | cost_time = end_time - start_time
280 | print('****************** LIGHTGBM COST TIME : ',str(cost_time),' ******************')
281 |
282 | '''
283 |
284 | v7:  best version, offline 7356
285 | v8:  v7 + keyword cross features, offline 0.7357, online 7338
286 | v8.1: v7 + keyword cross features, fed to the model as categorical variables, 0.73361
287 | v8.2: v7 + keyword cross features as categorical variables, with TOP_KEY_WORDS removed, 7117
288 | v8.3: v7 + keyword cross features as categorical variables, using TOP_KEY_WORDS_2, 7260
289 | v8.3: v7 + keyword cross features as categorical variables, adding TOP_KEY_WORDS_2, 7260
290 |
291 | '''
292 |
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/catboost_fs.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 | import warnings
4 |
5 | import numpy as np
6 | import pandas as pd
7 |
8 |
9 | from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
10 | get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
11 | get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
12 | get_w2v_feats, get_key_for_top_fea,get_time_diff_feats_v2
13 | from model import run_cbt
14 | from utils import RESULT_DIR, TRAIN_DIR, \
15 | TEST_A_DIR, KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL,PSEUDO_FALG,GENERATION_DIR
16 |
17 | warnings.filterwarnings('ignore')
18 |
19 |
20 | def get_label(PSEUDO_FALG):
21 | preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
22 | preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
23 |
24 | if PSEUDO_FALG:
25 |         print('Loading pseudo labels')
26 | pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
27 | label = pd.concat([preliminary_train_label_dataset,
28 | pseudo_labels,
29 | preliminary_train_label_dataset_s],
30 | ignore_index=True,
31 | axis=0).sort_values(
32 | ['sn', 'fault_time']).reset_index(drop=True)
33 | else:
34 |         print('Not using pseudo-label data')
35 | label = pd.concat([preliminary_train_label_dataset,
36 | preliminary_train_label_dataset_s],
37 | ignore_index=True,
38 | axis=0).sort_values(
39 | ['sn', 'fault_time']).reset_index(drop=True)
40 | label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
41 | label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
42 | return label
43 |
44 |
45 | def get_log_dateset(PSEUDO_FALG):
46 | preliminary_sel_log_dataset = pd.read_csv(preliminary_sel_log_dataset_path)
47 | preliminary_sel_log_dataset_a = pd.read_csv(preliminary_sel_log_dataset_a_path)
48 | if PSEUDO_FALG:
49 |         print('Loading pseudo-label log data')
50 | pseudo_sel_log_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_sel_log_dataset.csv'))
51 | log_dataset = pd.concat([preliminary_sel_log_dataset,
52 | pseudo_sel_log_dataset,
53 | preliminary_sel_log_dataset_a],
54 | ignore_index=True,
55 | axis=0).sort_values(
56 | ['sn', 'time', 'server_model']).reset_index(drop=True)
57 | else:
58 |         print('Not using pseudo-label data')
59 | log_dataset = pd.concat([preliminary_sel_log_dataset,
60 | preliminary_sel_log_dataset_a],
61 | ignore_index=True,
62 | axis=0).sort_values(
63 | ['sn', 'time', 'server_model']).reset_index(drop=True)
64 | log_dataset['time'] = log_dataset['time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
65 |
66 | return log_dataset
67 |
68 | def get_fea_distribute(feature_df, feature_importances, dataset_type, top=30):
69 |     print('Using the top feature importances to profile value distributions, to check whether train and test are identically distributed')
70 | fea_distribute_list = []
71 | for i in feature_importances[:top]['fea'].to_list():
72 | fea_distribute_tmp = (feature_df[i].value_counts() / len(feature_df)).reset_index().rename(
73 | columns={'index': 'value'})
74 | fea_distribute_list.append(fea_distribute_tmp)
75 |
76 | fea_distribute = fea_distribute_list[-1]
77 | for i in fea_distribute_list[:-1]:
78 | fea_distribute = fea_distribute.merge(i, on='value', how='left')
79 | fea_distribute['value'] = fea_distribute['value'].apply(lambda x: f'{dataset_type}_{int(x)}')
80 | return fea_distribute
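# Hypothetical usage sketch (added; the original script defines this helper but
# never calls it here). Assuming the fea_imp_df returned by run_cbt has the
# 'fea' column this helper expects:
#
#     train_dist = get_fea_distribute(train[use_cols], fea_imp_df, 'train', top=30)
#     test_dist = get_fea_distribute(test[use_cols], fea_imp_df, 'test', top=30)
#
# Comparing the matching rows of the two frames would surface train/test drift
# in the most important features.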
81 |
82 |
83 |
84 | def get_train_test(label, preliminary_submit_dataset_a, log_dataset):
85 |     print('Building the train and test datasets')
86 | train = label.merge(log_dataset, on='sn', how='left')
87 | test = preliminary_submit_dataset_a.merge(log_dataset, on='sn', how='left')
88 | # train['time_interval'] = (pd.to_datetime( train['fault_time'])-train['time'] ).apply(lambda x:x.total_seconds())
89 | # test['time_interval'] = (pd.to_datetime( test['fault_time'])- test['time'] ).apply(lambda x:x.total_seconds())
90 | # train = train.query('time_interval > 0')
91 | # test = test.query('time_interval > 0')
92 |     print(f'Train shape: {train.shape}, test shape: {test.shape}')
93 | train = train.drop_duplicates().reset_index(drop=True)
94 | test = test.drop_duplicates().reset_index(drop=True)
95 | train['time'] = pd.to_datetime(train['time'])
96 | test['time'] = pd.to_datetime(test['time'])
97 | return train, test
98 | start_time = datetime.datetime.now()
99 |
100 | additional_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'additional_sel_log_dataset.csv')
101 | preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
102 | preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
103 | preliminary_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_sel_log_dataset.csv')
104 |
105 | preliminary_submit_dataset_a_path = os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv')
106 | preliminary_sel_log_dataset_a_path = os.path.join(TEST_A_DIR, 'final_sel_log_dataset_b.csv')
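# The preliminary_*_a variable names are kept from the preliminary round, but
# they now point at the final-round B files under TEST_A_DIR.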
107 |
108 | print(preliminary_submit_dataset_a_path, preliminary_sel_log_dataset_a_path)
109 |
110 | preliminary_submit_dataset_a = pd.read_csv(preliminary_submit_dataset_a_path)
111 | preliminary_submit_dataset_a.head()
112 |
113 | log_dataset = get_log_dateset(PSEUDO_FALG)
114 | label = get_label(PSEUDO_FALG)
115 |
116 |
117 | next_time_list = [i / TIME_INTERVAL for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600]] + [1000000]
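# next_time_list expresses look-back windows of 3 to 600 minutes in units of
# TIME_INTERVAL, plus a 1000000 sentinel that effectively covers the whole log
# history; the minute values are recovered below via time_list = [i * TIME_INTERVAL ...].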
118 |
119 | label, preliminary_submit_dataset_a = add_last_next_time4fault(label, preliminary_submit_dataset_a, TIME_INTERVAL,
120 | next_time_list)
121 | train, test = get_train_test(label, preliminary_submit_dataset_a, log_dataset)
122 | train = train.drop_duplicates(['sn', 'fault_time', 'time', 'msg', 'server_model']).reset_index(drop=True)
123 |
124 | train['time_interval'] = (pd.to_datetime(train['fault_time']) - pd.to_datetime(train['time'])).apply(
125 | lambda x: x.total_seconds())
126 | test['time_interval'] = (pd.to_datetime(test['fault_time']) - pd.to_datetime(test['time'])).apply(
127 | lambda x: x.total_seconds())
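# time_interval is the signed gap in seconds between fault_time and each log
# line; positive values mean the log entry precedes the fault.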
128 |
129 | all_data = pd.concat([train, test], axis=0, ignore_index=True)
130 | all_data = all_data.sort_values(['sn','server_model', 'fault_time', 'time'])
131 | w2v_feats = get_w2v_feats(all_data,
132 | f1_list = ['sn'],
133 | f2_list = ['msg_list', 'msg_0', 'msg_1', 'msg_2'])
134 | # Build time_diff_feats_v2
135 | time_diff_feats_v2 = get_time_diff_feats_v2(all_data)
136 | # Build server_model_time_interval_stat_fea
137 | server_model_time_interval_stat_fea = get_server_model_time_interval_stat_fea(all_data)
138 |
139 | msg_text_fea = get_msg_text_fea_all(all_data)
140 | # Build time-difference (duration) features
141 | duration_minutes_fea = get_duration_minutes_fea(train, test)
142 |
143 | # Build server_model features
144 | server_model_fea = get_server_model_fea(train, test)
145 | counter = get_word_counter(train)
146 |
147 | # Build nearest_msg features
148 | nearest_msg_fea = get_nearest_msg_fea(train, test)
149 | # Build server_model beta_target features
150 | beta_target_fea = get_beta_target(train, test)
151 |
152 | key = ['sn', 'fault_time', 'label', 'server_model']
153 |
154 | fea_num = len(KEY_WORDS)
155 | time_list = [i * TIME_INTERVAL for i in next_time_list]
156 | train = get_feature(train, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'label', 'server_model'])
157 | test = get_feature(test, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'server_model'])
158 |
159 | print('Adding duration (time-difference) features')
160 | train = train.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
161 | test = test.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
162 |
163 | print('Adding server_model features')
164 | train = train.merge(server_model_fea, on=['sn', 'server_model'])
165 | test = test.merge(server_model_fea, on=['sn', 'server_model'])
166 |
167 | print('Adding w2v_feats')
168 | train = train.merge(w2v_feats, on=['sn'])
169 | test = test.merge(w2v_feats, on=['sn'])
170 |
171 | print('Adding nearest_msg features')
172 | train = train.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
173 | test = test.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
174 |
175 | print('Adding beta_target features')
176 | train = train.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
177 | test = test.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
178 |
179 | server_model_sn_fea_2 = get_server_model_sn_fea_2(train, test)
180 | print('Adding server_model_sn_fea_2 features')
181 | train = train.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
182 | test = test.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
183 |
184 | print('Adding time_diff_feats_v2 features')
185 | train = train.merge(time_diff_feats_v2, on=['sn', 'server_model', 'fault_time'])
186 | test = test.merge(time_diff_feats_v2, on=['sn', 'server_model', 'fault_time'])
187 |
188 | # test.to_csv(os.path.join(GENERATION_DIR,'test.csv'),index =False)
189 | # train.to_csv(os.path.join(GENERATION_DIR,'train.csv'),index =False)
190 |
191 | # crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR,'crashdump_venus_fea.csv') )
192 | # print('Adding crashdump_venus_fea features')
193 | # print(train.shape,test.shape,crashdump_venus_fea.shape)
194 | # train = train.merge(crashdump_venus_fea, on=['sn' , 'fault_time'],how = 'left')
195 | # test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time' ],how = 'left')
196 | # print(train.shape,test.shape )
197 |
198 | # crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR,'crashdump_venus_fea_v1.csv') )
199 | # print('Adding crashdump_venus_fea features')
200 | # print(train.shape,test.shape,crashdump_venus_fea.shape)
201 | # train = train.merge(crashdump_venus_fea, on=['sn' , 'fault_time'],how = 'left')
202 | # test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time' ],how = 'left')
203 | # print(train.shape,test.shape )
204 | # test.to_csv(os.path.join(GENERATION_DIR,'test.csv'),index =False)
205 | # train.to_csv(os.path.join(GENERATION_DIR,'train.csv'),index =False)
206 | # print('Adding key_for_top_fea features')
207 | # train,test = get_key_for_top_fea(train,test)
208 |
209 | # print('Adding w2v_tfidf_doc2v_fea features')
210 | # w2v_tfidf_fea = pd.read_csv(os.path.join(GENERATION_DIR,'w2v_tfidf_fea.csv'))
211 | # drop_cols = [i for i in w2v_tfidf_fea if 'doc2vec' in i ]+[i for i in w2v_tfidf_fea if 'tfidf' in i ]
212 | # for col in drop_cols:
213 | # del w2v_tfidf_fea[col]
214 | #
215 | # train = train.merge(w2v_tfidf_fea, on=['sn' ], how='left')
216 | # test = test.merge(w2v_tfidf_fea, on=['sn' ], how='left')
217 |
218 | # print('Adding keyword cross features')
219 | # train,test = get_key_word_cross_fea(train,test)
220 |
221 | # print('Adding server_model_time_interval_stat_fea features')
222 | # train = train.merge(server_model_time_interval_stat_fea, on=['server_model' ],how ='left')
223 | # test = test.merge(server_model_time_interval_stat_fea, on=['server_model' ],how ='left')
224 |
225 |
226 | use_less_cols_1 = ['last_last_msg_cnt', 'last_first_msg_cnt','time_diff_1_min',
227 | 'last_msg_list_unique_LabelEnc', 'last_msg_0_unique_LabelEnc',
228 | 'last_msg_1_unique_LabelEnc', 'last_msg_2_unique_LabelEnc',
229 | 'last_msg_list_list_LabelEnc', 'last_msg_0_list_LabelEnc',
230 | 'last_msg_1_list_LabelEnc', 'last_msg_2_list_LabelEnc',
231 | 'last_msg_0_first_LabelEnc', 'last_msg_1_first_LabelEnc',
232 | 'last_msg_2_first_LabelEnc', 'last_msg_0_last_LabelEnc',
233 | 'last_msg_1_last_LabelEnc', 'last_msg_2_last_LabelEnc',
234 | 'last_msg_last_LabelEnc', 'last_msg_first_LabelEnc']
235 |
236 | use_less_col = [i for i in train.columns if train[i].nunique() < 2] + use_less_cols_1
237 |
238 |
239 | print(f'use_less_col:{len(use_less_col)}')
240 | use_cols = [i for i in train.columns if i not in ['sn', 'fault_time', 'label', 'server_model'] + use_less_col]
241 |
242 | cat_cols = ['server_model_LabelEnc', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc',]
243 | use_cols = sorted(use_cols)
244 |
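# The loop below overrides the hand-written cat_cols above: every *_LabelEnc
# column among use_cols is passed to CatBoost as a categorical feature.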
245 | cat_cols = []
246 | for i in use_cols:
247 | if '_LabelEnc' in i:
248 | cat_cols.append(i)
249 | print('Number of features used:', len(use_cols), 'number of categorical features:', len(cat_cols))
250 | # fs = FeatureSelector(data=train[use_cols], labels=train['label'])
251 | #
252 | # # Identify features whose missing-value ratio exceeds 0.9
253 | # fs.identify_missing(missing_threshold=0.9)
254 | #
255 | # # # Inspect the selected features
256 | # # fs.ops['missing']
257 | # # Without one-hot encoding the features (the default), identify pairs with correlation above 0.99
258 | # fs.identify_collinear(correlation_threshold=0.99, one_hot=False)
259 | #
260 | # # # Inspect the selected features
261 | # # fs.ops['collinear']
262 | #
263 | # # Identify features with only a single value
264 | # fs.identify_single_unique()
265 | #
266 | # # # Inspect the selected features
267 | # # fs.ops['single_unique']
268 | #
269 | # train_removed = fs.remove(methods=['missing', 'single_unique', 'collinear'], keep_one_hot=False)
270 | # use_cols = train_removed.columns
271 | # print('Number of features used after feature selection:', len(use_cols))
272 |
273 |
274 | oof_prob = np.zeros((train.shape[0], 4))
275 | test_prob = np.zeros((test.shape[0], 4))
276 | # seeds = [42, 4242, 40424, 1024, 2048]
277 | seeds = [42]
278 | for seed in seeds:
279 |     oof_prob_seed, test_prob_seed, fea_imp_df, model_list = run_cbt(train[use_cols], train[['label']], test[use_cols], k=5,
280 |                                                                     seed=seed, cat_cols=cat_cols)
281 |     oof_prob += oof_prob_seed / len(seeds)  # average across seeds
282 |     test_prob += test_prob_seed / len(seeds)
283 |
284 |
285 | weight = search_weight(train, train[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
286 | oof_prob = oof_prob * np.array(weight)
287 | test_prob = test_prob * np.array(weight)
288 |
289 | target_df = train[['sn', 'fault_time', 'label']]
290 | submit_df = train[['sn', 'fault_time']]
291 | submit_df['label'] = oof_prob.argmax(axis=1)
292 |
293 | score = macro_f1(target_df=target_df, submit_df=submit_df)
294 | print(f'********************** BEST MACRO_F1 : {score} **********************')
295 | score = round(score, 5)
296 |
297 | y_pred = test_prob.argmax(axis=1)
298 | result = test[['sn', 'fault_time']]
299 | result['label'] = y_pred
300 | result = preliminary_submit_dataset_a.merge(result, on=['sn', 'fault_time'], how='left')[['sn', 'fault_time', 'label']]
301 | result['label'] = result['label'].fillna(0).astype(int)
302 |
303 | result.to_csv(os.path.join(RESULT_DIR, 'catboost_result.csv'), index=False)
304 | print(result['label'].value_counts())
305 | fea_imp_df = fea_imp_df.reset_index(drop=True)
306 | fea_imp_df.to_csv(os.path.join(RESULT_DIR, f'cat_fea_imp_{int(score * 100000)}.csv'), index=False)
307 |
308 | train_result_prob = pd.DataFrame(oof_prob).add_prefix('cat_class_')
309 | test_result_prob = pd.DataFrame(test_prob).add_prefix('cat_class_')
310 | train_result_prob['label'] = train['label']
311 | train_result_prob['sn'] = train['sn']
312 | train_result_prob['fault_time'] = train['fault_time']
313 | test_result_prob['sn'] = test['sn']
314 | test_result_prob['fault_time'] = test['fault_time']
315 |
316 | result_prob = pd.concat([train_result_prob, test_result_prob], ignore_index=True)
317 | result_prob.to_csv(os.path.join(RESULT_DIR, 'cat_prob_result.csv'), index=False)
318 |
319 | end_time = datetime.datetime.now()
320 | cost_time = end_time - start_time
321 | print('****************** CATBOOST COST TIME : ',str(cost_time),' ******************')
322 |
323 | '''
324 |
325 | v7: best version, offline 0.7303
326 | v8: v7 + keyword cross features fed to the model as categorical variables, 0.73114
327 |
328 | '''
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/get_crashdump_venus_fea.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 | import gc
4 | import warnings
5 | import pandas as pd
6 | import pickle
7 | from gensim.models.word2vec import Word2Vec
8 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument
9 | from sklearn.utils.class_weight import compute_class_weight
10 | from sklearn.preprocessing import LabelEncoder
11 | from sklearn.feature_extraction.text import TfidfVectorizer
12 | from sklearn.decomposition import TruncatedSVD
13 | import numpy as np
14 | import pandas as pd
15 | from generate_feature import add_w2v_feats, cat2num
16 | from generate_feature import get_key
17 |
18 | from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
19 | get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
20 | get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
21 | get_w2v_feats, get_key, get_class_key_words_nunique
22 | from model import run_cbt, run_lgb
23 | from utils import RESULT_DIR, TRAIN_DIR, \
24 | TEST_A_DIR, KEY_WORDS, TOP_KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL, PSEUDO_FALG, \
25 | GENERATION_DIR
26 |
27 | warnings.filterwarnings('ignore')
28 |
29 |
30 | def get_fault_code_list(x):
31 |     try:
32 |         x = x.replace('.', ',').split(',')
33 |     except AttributeError:  # NaN values are floats and have no .replace
34 |         x = []
35 |     return x
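# e.g. a hypothetical fault_code string 'cpu0.0x01,cod1' becomes
# ['cpu0', '0x01', 'cod1']; NaN input yields [].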
36 |
37 |
38 | def get_module_cause_list(x):
39 |     try:
40 |         x = x.replace(',', '_').replace(',', '_')
41 |         x = list(set(x.split('_')))
42 |     except AttributeError:  # NaN values are floats and have no .replace
43 |         x = []
44 |     return x
45 |
46 |
47 | def get_label(PSEUDO_FALG):
48 | preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
49 | preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
50 |
51 | if PSEUDO_FALG:
52 |         print('Loading pseudo labels')
53 | pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
54 | label = pd.concat([preliminary_train_label_dataset,
55 | pseudo_labels,
56 | preliminary_train_label_dataset_s],
57 | ignore_index=True,
58 | axis=0).sort_values(
59 | ['sn', 'fault_time']).reset_index(drop=True)
60 | else:
61 |         print('Not using pseudo-label data')
62 | label = pd.concat([preliminary_train_label_dataset,
63 | preliminary_train_label_dataset_s],
64 | ignore_index=True,
65 | axis=0).sort_values(
66 | ['sn', 'fault_time']).reset_index(drop=True)
67 | label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
68 | label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
69 | return label
70 |
71 |
72 | def get_module_cause_code(x, code_name):
73 | code_list = []
74 | for i in x:
75 | if code_name in i:
76 | code_list.append(i)
77 | return code_list
78 |
79 |
80 | def get_alertname_code(x, alertname):
81 |     x = x.split(',')
82 |
83 |     try:
84 |         alertname_code = x[x.index(alertname) + 1]
85 |     except (ValueError, IndexError):  # alertname missing, or it is the last token
86 |         alertname_code = np.nan
87 |     return alertname_code
88 |
89 |
90 | def get_alertname_code_2(x, alertname):
91 |     # x = x.split(',')
92 |
93 |     try:
94 |         alertname_code = x[x.index(alertname) + 1]
95 |     except (ValueError, IndexError):  # alertname missing, or it is the last token
96 |         alertname_code = ' '
97 |     return alertname_code
98 |
99 |
100 | def get_last_msg_cnt(x):
101 | last_msg = x[-1]
102 | cnt = x.count(last_msg)
103 | return cnt
104 |
105 |
106 | def get_first_msg_cnt(x):
107 | first_msg = x[0]
108 | cnt = x.count(first_msg)
109 | return cnt
110 |
111 |
112 | def get_crashdump_venus_data():
113 | final_venus_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_venus_dataset_b.csv'))
114 | final_crashdump_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_crashdump_dataset_b.csv'))
115 | final_crashdump_venus = final_crashdump_dataset.merge(final_venus_dataset, on=['sn', 'fault_time'],
116 | how='outer')
117 |
118 | preliminary_venus_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_venus_dataset.csv'))
119 | preliminary_crashdump_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_crashdump_dataset.csv'))
120 | preliminary_crashdump_venus = preliminary_crashdump_dataset.merge(preliminary_venus_dataset,
121 | on=['sn', 'fault_time'],
122 | how='outer')
123 |
124 | crashdump_venus = pd.concat([final_crashdump_venus, preliminary_crashdump_venus],
125 | ignore_index=True).drop_duplicates()
126 | crashdump_venus = crashdump_venus.sort_values(['sn', 'fault_time']).reset_index(drop=True)
127 | return crashdump_venus
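# crashdump and venus records are joined with an outer merge on (sn, fault_time),
# so a server keeps its row even when only one of the two log sources fired.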
128 |
129 |
130 | def get_crashdump_venus_fea(crashdump_venus):
131 |     print('Generating crashdump_venus features')
132 | crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].apply(lambda x: get_module_cause_list(x))
133 | crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].apply(lambda x: get_fault_code_list(x))
134 |
135 | code_name_list = ['module', 'cod1', 'cod2', 'addr', 'port']
136 | for code_name in code_name_list:
137 | crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus['module_cause_list'].apply(
138 | lambda x: get_module_cause_code(x, code_name))
139 | crashdump_venus[f'module_cause_{code_name}_len'] = crashdump_venus[f'module_cause_{code_name}'].apply(
140 | lambda x: len(x))
141 | crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus[f'module_cause_{code_name}'].apply(
142 | lambda x: '_'.join(set(x)))
143 | code_name_list = ['cha', '0x', 'cod', 'core', 'cpu', 'm2m', 'pcu']
144 | for code_name in code_name_list:
145 | crashdump_venus[f'fault_{code_name}'] = crashdump_venus['fault_code_list'].apply(
146 | lambda x: get_module_cause_code(x, code_name))
147 | crashdump_venus[f'fault_{code_name}_len'] = crashdump_venus[f'fault_{code_name}'].apply(lambda x: len(x))
148 | crashdump_venus[f'fault_{code_name}'] = crashdump_venus[f'fault_{code_name}'].apply(lambda x: '_'.join(set(x)))
149 |
150 | cols_tmp = ['module_cause', 'fault_code', 'module_cause_module',
151 | 'module_cause_cod1', 'module_cause_cod2', 'module_cause_addr',
152 | 'module_cause_port', 'fault_cha', 'fault_0x', 'fault_cod', 'fault_core',
153 | 'fault_cpu', 'fault_m2m', 'fault_pcu', ]
154 | new_cat_cols = []
155 | crashdump_venus = cat2num(crashdump_venus, cols_tmp)
156 | for name in cols_tmp:
157 | # le = LabelEncoder()
158 | # crashdump_venus[f'{name}_LabelEnc'] = le.fit_transform(crashdump_venus[name])
159 | new_cat_cols.append(f'{name}_LabelEnc')
160 |
161 | num_cols = ['fault_pcu_len', 'fault_m2m_len',
162 | 'fault_cpu_len', 'fault_0x_len', 'fault_cod_len',
163 | 'module_cause_module_len', 'module_cause_cod1_len',
164 | 'module_cause_cod2_len', 'module_cause_addr_len',
165 | 'module_cause_port_len', 'fault_cha_len', 'fault_core_len', ]
166 |
167 | crashdump_venus = crashdump_venus[['sn', 'fault_time'] + new_cat_cols + num_cols]
168 | crashdump_venus = crashdump_venus.rename(columns={'fault_time': 'crashdump_fault_time'})
169 |
170 | crashdump_venus['crashdump_fault_time'] = pd.to_datetime(crashdump_venus['crashdump_fault_time'])
171 | del crashdump_venus['crashdump_fault_time']
172 |     print(f'Finished generating crashdump_venus features, shape {crashdump_venus.shape}')
173 | return crashdump_venus
174 |
175 |
176 | def get_location_word(x, num):
177 |     try:
178 |         return x[num]
179 |     except (IndexError, TypeError):  # position out of range, or x is not indexable
180 |         return
181 |
182 |
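# NOTE: the get_label below repeats the definition near the top of this file
# verbatim; the redefinition is harmless but redundant.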
183 | def get_label(PSEUDO_FALG):
184 | preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
185 | preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
186 |
187 | if PSEUDO_FALG:
188 |         print('Loading pseudo labels')
189 | pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
190 | label = pd.concat([preliminary_train_label_dataset,
191 | pseudo_labels,
192 | preliminary_train_label_dataset_s],
193 | ignore_index=True,
194 | axis=0).sort_values(
195 | ['sn', 'fault_time']).reset_index(drop=True)
196 | else:
197 |         print('Not using pseudo-label data')
198 | label = pd.concat([preliminary_train_label_dataset,
199 | preliminary_train_label_dataset_s],
200 | ignore_index=True,
201 | axis=0).sort_values(
202 | ['sn', 'fault_time']).reset_index(drop=True)
203 | label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
204 | label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
205 | return label
206 |
207 |
208 | module_list = ['module0','module1','module2','module3','module4','module5','module7','module8','module9',
209 | 'module10','module11','module12','module13','module14','module17','module18','module19',
210 | 'in traffic control',
211 | 'irpp0','irpp1',
212 | 'pcie rootport 0:0.0','pcie rootport a2:0.0','pcie rootport 2b:3.0',
213 | 'port a','port c']
214 | module_list2 = ['module0','module1','module2','module3','module4','module5','module7','module8','module9',
215 | 'module10','module11','module12','module13','module14','module17','module18','module19']
216 | other_module_list = ['in traffic control', 'irpp0', 'irpp1', 'pcie rootport 0:0.0',
217 | 'pcie rootport a2:0.0', 'pcie rootport 2b:3.0', 'port a', 'port c']
218 | module_content_list = ['module0_cod1', 'module0_cod2', 'module0_addr',
219 | 'module1_cod1', 'module1_cod2', 'module1_addr', 'module2_cod1',
220 | 'module2_cod2', 'module2_addr', 'module3_cod1', 'module3_cod2',
221 | 'module3_addr', 'module4_cod1', 'module4_cod2', 'module4_addr',
222 | 'module5_cod1', 'module5_cod2', 'module5_addr', 'module7_cod1',
223 | 'module7_cod2', 'module7_addr', 'module8_cod1', 'module8_cod2',
224 | 'module8_addr', 'module9_cod1', 'module9_cod2', 'module9_addr',
225 | 'module10_cod1', 'module10_cod2', 'module10_addr', 'module11_cod1',
226 | 'module11_cod2', 'module11_addr', 'module12_cod1', 'module12_cod2',
227 | 'module12_addr', 'module13_cod1', 'module13_cod2', 'module13_addr',
228 | 'module14_cod1', 'module14_cod2', 'module14_addr', 'module17_cod1',
229 | 'module17_cod2', 'module17_addr', 'module18_cod1', 'module18_cod2',
230 | 'module18_addr', 'module19_cod1', 'module19_cod2', 'module19_addr']
231 | fault_code_content_list = ['fault_code_cod1', 'fault_code_cod2',
232 | 'fault_code_cpu0', 'fault_code_cpu1']
233 |
234 |
235 | crashdump_venus = get_crashdump_venus_data()
236 | crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].fillna('_').apply(lambda x:x.split(','))
237 | crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(lambda x:x.replace(':','_').replace(',','_'))
238 | for module in module_list:
239 | crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(
240 | lambda x:x.replace(f'{module}_',f'{module}:').replace(f'_{module}',f',{module}'))
241 | crashdump_venus['module_cause'] = crashdump_venus['module_cause'].apply(lambda x:x.replace(':',','))
242 |
243 | for module in module_list:
244 | crashdump_venus[module] = crashdump_venus['module_cause'].apply(lambda x:get_alertname_code(x,module))
245 | crashdump_venus[module] = crashdump_venus.loc[:,module].fillna(' ').apply(lambda x:x.replace('_',' '))
246 | crashdump_venus[module] = crashdump_venus[module].apply(lambda x:x.split(' '))
247 | crashdump_venus['module_cause_new'] = crashdump_venus.loc[:,module_list].sum(1)
248 |
249 |
250 | for module in module_list2:
251 | crashdump_venus[f'{module}_cod1'] = crashdump_venus[module].apply(lambda x:[get_alertname_code_2(x,'cod1')])
252 | crashdump_venus[f'{module}_cod2'] = crashdump_venus[module].apply(lambda x:[get_alertname_code_2(x,'cod2')])
253 | crashdump_venus[f'{module}_addr'] = crashdump_venus[module].apply(lambda x:[get_alertname_code_2(x,'addr')])
254 | del crashdump_venus[module]
255 | gc.collect()
256 |
257 | crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].fillna(' ').apply(lambda x:x.split('.'))
258 | for i in ['cod1','cod2','cpu0','cpu1']:
259 | crashdump_venus[f'fault_code_{i}'] = crashdump_venus['fault_code_list'].apply(lambda x:[get_alertname_code_2(x,i)])
260 |
261 |
262 | crashdump_venus['other_module_list'] = crashdump_venus.loc[:,other_module_list].sum(1)
263 | crashdump_venus['module_content_list'] = crashdump_venus.loc[:,module_content_list].sum(1)
264 | crashdump_venus['module_cause_new'] = crashdump_venus.loc[:,other_module_list+module_content_list].sum(1)
265 | crashdump_venus['fault_code_content_list'] = crashdump_venus.loc[:,fault_code_content_list].sum(1)
266 | crashdump_venus['all_crashdump_venus'] = crashdump_venus.loc[:,other_module_list+module_content_list+fault_code_content_list].sum(1)
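# These columns hold Python lists, so .sum(1) concatenates them row-wise into
# progressively larger token "documents"; they feed the w2v features below.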
267 |
268 | f1_list = ['sn']
269 | f2_list = ['other_module_list','module_content_list','module_cause_new','fault_code_content_list','all_crashdump_venus']
270 | w2v_feats_df = crashdump_venus[f1_list].drop_duplicates()
271 | w2v_feats_df_list = []
272 | for f1 in f1_list:
273 |     for f2 in f2_list:
274 |         w2v_fea_tmp = add_w2v_feats(crashdump_venus, w2v_feats_df, f1, f2, emb_size=10, window=5, min_count=5)
275 |         w2v_feats_df_list.append(w2v_fea_tmp)
276 | w2v_feats_df = w2v_feats_df_list[0]
277 | for i in w2v_feats_df_list[1:]:
278 |     w2v_feats_df = w2v_feats_df.merge(i, on='sn', how='left')
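# add_w2v_feats comes from generate_feature (not shown here); from its arguments
# it appears to train a small word2vec model (emb_size=10) per token column and
# aggregate the token vectors by sn. The per-column frames are merged back into
# one sn-level feature table.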
279 |
280 | for i in other_module_list+module_content_list+fault_code_content_list:
281 | crashdump_venus[i] = crashdump_venus[i].astype(str)
282 |
283 | crashdump_venus = cat2num(crashdump_venus,other_module_list+module_content_list+fault_code_content_list)
284 | for i in other_module_list+module_content_list+fault_code_content_list:
285 | del crashdump_venus[i]
286 | gc.collect()
287 | crashdump_venus = crashdump_venus.merge(w2v_feats_df, on='sn', how='left').rename(columns={'fault_time': 'crashdump_venus_fault_time'})
288 |
289 | preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
290 | preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
291 | test = pd.read_csv(os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv'))[['sn', 'fault_time' ]]
292 | train = get_label(False)[['sn', 'fault_time', 'label',]]
293 |
294 | test_tmp = test[['sn', 'fault_time']]
295 | test_tmp = test_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)
296 | train_tmp = train[['sn', 'fault_time', 'label', ]]
297 | train_tmp = train_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)
298 |
299 |
300 | train_tmp['duration_fault_time'] = pd.to_datetime(train_tmp['fault_time']) - pd.to_datetime(train_tmp['crashdump_venus_fault_time'])
301 | test_tmp['duration_fault_time'] = pd.to_datetime(test_tmp['fault_time']) - pd.to_datetime(test_tmp['crashdump_venus_fault_time'])
302 |
303 | train_tmp['duration_fault_time'] = train_tmp['duration_fault_time'].apply(lambda x:x.total_seconds())
304 | test_tmp['duration_fault_time'] = test_tmp['duration_fault_time'].apply(lambda x:x.total_seconds())
305 |
306 |
307 | drop_cols = ['sn', 'fault_time', 'fault_code', 'module_cause', 'module','crashdump_venus_fault_time',
308 | 'module_cause_list', 'module_cause_new', 'fault_code_list','label','duration_fault_time',
309 | 'other_module_list', 'module_content_list', 'fault_code_content_list',
310 | 'all_crashdump_venus',]
311 | use_cols = [i for i in train_tmp.columns if i not in drop_cols]
312 |
313 | cat_cols = [f'{i}_LabelEnc' for i in other_module_list+module_content_list+fault_code_content_list]
314 |
315 | oof_prob = np.zeros((train_tmp.shape[0], 4))  # sized to train_tmp, the frame actually trained on
316 |
317 | test_prob = np.zeros((test_tmp.shape[0], 4))
318 | # seeds = [42, 4242, 40424, 1024, 2048]
319 | seeds = [42]
320 | for seed in seeds:
321 |     oof_prob_seed, test_prob_seed, fea_imp_df, model_list = run_cbt(train_tmp[use_cols], train_tmp[['label']], test_tmp[use_cols], k=5,
322 |                                                                     seed=seed, cat_cols=cat_cols)
323 |     oof_prob += oof_prob_seed / len(seeds)  # average across seeds
324 |     test_prob += test_prob_seed / len(seeds)
325 |
326 |
327 | weight = search_weight(train_tmp, train_tmp[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
328 | oof_prob = oof_prob * np.array(weight)
329 | test_prob = test_prob * np.array(weight)
330 |
331 |
332 | target_df = train_tmp[['sn', 'fault_time', 'label']].drop_duplicates(['sn', 'fault_time'])
333 | submit_df = train_tmp[['sn', 'fault_time']]
334 | submit_df['label'] = oof_prob.argmax(axis=1)
335 | submit_df = submit_df.drop_duplicates(['sn', 'fault_time'])
336 | # submit_df = pd.read_csv(os.path.join(GENERATION_DIR,'crashdump_venus_fea1.csv')).rename(columns = {'crashdump_venus_label':'label'})
337 |
338 |
339 | score = macro_f1(target_df=target_df, submit_df=submit_df)
340 | print(f'********************** BEST MACRO_F1 : {score} **********************')
341 | score = round(score, 5)
342 |
343 | print(fea_imp_df[:20])
344 | y_pred = test_prob.argmax(axis=1)
345 | result = test_tmp[['sn', 'fault_time']]
346 | result['label'] = y_pred
347 | result = result.drop_duplicates(['sn', 'fault_time'])
348 |
349 | crashdump_venus_fea = pd.concat([submit_df, result], ignore_index=False, axis=0)
350 | crashdump_venus_fea = crashdump_venus_fea.rename(columns={'label': 'crashdump_venus_label_v1'})
351 | crashdump_venus_fea.to_csv(os.path.join(GENERATION_DIR, 'crashdump_venus_fea_v1.csv'), index=False)
352 | print(crashdump_venus_fea['crashdump_venus_label_v1'].value_counts())
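# crashdump_venus_label_v1 is this auxiliary CatBoost model's predicted class for
# each (sn, fault_time); lgb_fs.py reads crashdump_venus_fea_v1.csv back in as a
# stacking-style feature.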
353 |
354 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU AFFERO GENERAL PUBLIC LICENSE
2 | Version 3, 19 November 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU Affero General Public License is a free, copyleft license for
11 | software and other kinds of works, specifically designed to ensure
12 | cooperation with the community in the case of network server software.
13 |
14 | The licenses for most software and other practical works are designed
15 | to take away your freedom to share and change the works. By contrast,
16 | our General Public Licenses are intended to guarantee your freedom to
17 | share and change all versions of a program--to make sure it remains free
18 | software for all its users.
19 |
20 | When we speak of free software, we are referring to freedom, not
21 | price. Our General Public Licenses are designed to make sure that you
22 | have the freedom to distribute copies of free software (and charge for
23 | them if you wish), that you receive source code or can get it if you
24 | want it, that you can change the software or use pieces of it in new
25 | free programs, and that you know you can do these things.
26 |
27 | Developers that use our General Public Licenses protect your rights
28 | with two steps: (1) assert copyright on the software, and (2) offer
29 | you this License which gives you legal permission to copy, distribute
30 | and/or modify the software.
31 |
32 | A secondary benefit of defending all users' freedom is that
33 | improvements made in alternate versions of the program, if they
34 | receive widespread use, become available for other developers to
35 | incorporate. Many developers of free software are heartened and
36 | encouraged by the resulting cooperation. However, in the case of
37 | software used on network servers, this result may fail to come about.
38 | The GNU General Public License permits making a modified version and
39 | letting the public access it on a server without ever releasing its
40 | source code to the public.
41 |
42 | The GNU Affero General Public License is designed specifically to
43 | ensure that, in such cases, the modified source code becomes available
44 | to the community. It requires the operator of a network server to
45 | provide the source code of the modified version running there to the
46 | users of that server. Therefore, public use of a modified version, on
47 | a publicly accessible server, gives the public access to the source
48 | code of the modified version.
49 |
50 | An older license, called the Affero General Public License and
51 | published by Affero, was designed to accomplish similar goals. This is
52 | a different license, not a version of the Affero GPL, but Affero has
53 | released a new version of the Affero GPL which permits relicensing under
54 | this license.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | TERMS AND CONDITIONS
60 |
61 | 0. Definitions.
62 |
63 | "This License" refers to version 3 of the GNU Affero General Public License.
64 |
65 | "Copyright" also means copyright-like laws that apply to other kinds of
66 | works, such as semiconductor masks.
67 |
68 | "The Program" refers to any copyrightable work licensed under this
69 | License. Each licensee is addressed as "you". "Licensees" and
70 | "recipients" may be individuals or organizations.
71 |
72 | To "modify" a work means to copy from or adapt all or part of the work
73 | in a fashion requiring copyright permission, other than the making of an
74 | exact copy. The resulting work is called a "modified version" of the
75 | earlier work or a work "based on" the earlier work.
76 |
77 | A "covered work" means either the unmodified Program or a work based
78 | on the Program.
79 |
80 | To "propagate" a work means to do anything with it that, without
81 | permission, would make you directly or secondarily liable for
82 | infringement under applicable copyright law, except executing it on a
83 | computer or modifying a private copy. Propagation includes copying,
84 | distribution (with or without modification), making available to the
85 | public, and in some countries other activities as well.
86 |
87 | To "convey" a work means any kind of propagation that enables other
88 | parties to make or receive copies. Mere interaction with a user through
89 | a computer network, with no transfer of a copy, is not conveying.
90 |
91 | An interactive user interface displays "Appropriate Legal Notices"
92 | to the extent that it includes a convenient and prominently visible
93 | feature that (1) displays an appropriate copyright notice, and (2)
94 | tells the user that there is no warranty for the work (except to the
95 | extent that warranties are provided), that licensees may convey the
96 | work under this License, and how to view a copy of this License. If
97 | the interface presents a list of user commands or options, such as a
98 | menu, a prominent item in the list meets this criterion.
99 |
100 | 1. Source Code.
101 |
102 | The "source code" for a work means the preferred form of the work
103 | for making modifications to it. "Object code" means any non-source
104 | form of a work.
105 |
106 | A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 |
111 | The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form. A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 |
122 | The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities. However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work. For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 |
135 | The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 |
139 | The Corresponding Source for a work in source code form is that
140 | same work.
141 |
142 | 2. Basic Permissions.
143 |
144 | All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met. This License explicitly affirms your unlimited
147 | permission to run the unmodified Program. The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work. This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 |
152 | You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force. You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright. Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 |
163 | Conveying under any other circumstances is permitted solely under
164 | the conditions stated below. Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 |
167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 |
169 | No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 |
175 | When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 |
183 | 4. Conveying Verbatim Copies.
184 |
185 | You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 |
193 | You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 |
196 | 5. Conveying Modified Source Versions.
197 |
198 | You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 |
202 | a) The work must carry prominent notices stating that you modified
203 | it, and giving a relevant date.
204 |
205 | b) The work must carry prominent notices stating that it is
206 | released under this License and any conditions added under section
207 | 7. This requirement modifies the requirement in section 4 to
208 | "keep intact all notices".
209 |
210 | c) You must license the entire work, as a whole, under this
211 | License to anyone who comes into possession of a copy. This
212 | License will therefore apply, along with any applicable section 7
213 | additional terms, to the whole of the work, and all its parts,
214 | regardless of how they are packaged. This License gives no
215 | permission to license the work in any other way, but it does not
216 | invalidate such permission if you have separately received it.
217 |
218 | d) If the work has interactive user interfaces, each must display
219 | Appropriate Legal Notices; however, if the Program has interactive
220 | interfaces that do not display Appropriate Legal Notices, your
221 | work need not make them do so.
222 |
223 | A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit. Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 |
233 | 6. Conveying Non-Source Forms.
234 |
235 | You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 |
240 | a) Convey the object code in, or embodied in, a physical product
241 | (including a physical distribution medium), accompanied by the
242 | Corresponding Source fixed on a durable physical medium
243 | customarily used for software interchange.
244 |
245 | b) Convey the object code in, or embodied in, a physical product
246 | (including a physical distribution medium), accompanied by a
247 | written offer, valid for at least three years and valid for as
248 | long as you offer spare parts or customer support for that product
249 | model, to give anyone who possesses the object code either (1) a
250 | copy of the Corresponding Source for all the software in the
251 | product that is covered by this License, on a durable physical
252 | medium customarily used for software interchange, for a price no
253 | more than your reasonable cost of physically performing this
254 | conveying of source, or (2) access to copy the
255 | Corresponding Source from a network server at no charge.
256 |
257 | c) Convey individual copies of the object code with a copy of the
258 | written offer to provide the Corresponding Source. This
259 | alternative is allowed only occasionally and noncommercially, and
260 | only if you received the object code with such an offer, in accord
261 | with subsection 6b.
262 |
263 | d) Convey the object code by offering access from a designated
264 | place (gratis or for a charge), and offer equivalent access to the
265 | Corresponding Source in the same way through the same place at no
266 | further charge. You need not require recipients to copy the
267 | Corresponding Source along with the object code. If the place to
268 | copy the object code is a network server, the Corresponding Source
269 | may be on a different server (operated by you or a third party)
270 | that supports equivalent copying facilities, provided you maintain
271 | clear directions next to the object code saying where to find the
272 | Corresponding Source. Regardless of what server hosts the
273 | Corresponding Source, you remain obligated to ensure that it is
274 | available for as long as needed to satisfy these requirements.
275 |
276 | e) Convey the object code using peer-to-peer transmission, provided
277 | you inform other peers where the object code and Corresponding
278 | Source of the work are being offered to the general public at no
279 | charge under subsection 6d.
280 |
281 | A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 |
285 | A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling. In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage. For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product. A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 |
298 | "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source. The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 |
306 | If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information. But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 |
317 | The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed. Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 |
325 | Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 |
331 | 7. Additional Terms.
332 |
333 | "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law. If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 |
342 | When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it. (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.) You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 |
349 | Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 |
353 | a) Disclaiming warranty or limiting liability differently from the
354 | terms of sections 15 and 16 of this License; or
355 |
356 | b) Requiring preservation of specified reasonable legal notices or
357 | author attributions in that material or in the Appropriate Legal
358 | Notices displayed by works containing it; or
359 |
360 | c) Prohibiting misrepresentation of the origin of that material, or
361 | requiring that modified versions of such material be marked in
362 | reasonable ways as different from the original version; or
363 |
364 | d) Limiting the use for publicity purposes of names of licensors or
365 | authors of the material; or
366 |
367 | e) Declining to grant rights under trademark law for use of some
368 | trade names, trademarks, or service marks; or
369 |
370 | f) Requiring indemnification of licensors and authors of that
371 | material by anyone who conveys the material (or modified versions of
372 | it) with contractual assumptions of liability to the recipient, for
373 | any liability that these contractual assumptions directly impose on
374 | those licensors and authors.
375 |
376 | All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10. If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term. If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 |
386 | If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 |
391 | Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 |
395 | 8. Termination.
396 |
397 | You may not propagate or modify a covered work except as expressly
398 | provided under this License. Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 |
403 | However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 |
410 | Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 |
417 | Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License. If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 |
423 | 9. Acceptance Not Required for Having Copies.
424 |
425 | You are not required to accept this License in order to receive or
426 | run a copy of the Program. Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance. However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work. These actions infringe copyright if you do
431 | not accept this License. Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 |
434 | 10. Automatic Licensing of Downstream Recipients.
435 |
436 | Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License. You are not responsible
439 | for enforcing compliance by third parties with this License.
440 |
441 | An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations. If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 |
451 | You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License. For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 |
459 | 11. Patents.
460 |
461 | A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based. The
463 | work thus licensed is called the contributor's "contributor version".
464 |
465 | A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version. For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 |
475 | Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 |
480 | In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement). To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 |
487 | If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients. "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 |
501 | If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 |
509 | A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License. You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 |
524 | Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 |
528 | 12. No Surrender of Others' Freedom.
529 |
530 | If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License. If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all. For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 |
540 | 13. Remote Network Interaction; Use with the GNU General Public License.
541 |
542 | Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software. This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 |
553 | Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work. The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 |
561 | 14. Revised Versions of this License.
562 |
563 | The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time. Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 |
568 | Each version is given a distinguishing version number. If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation. If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 |
577 | If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 |
582 | Later license versions may give you additional or different
583 | permissions. However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 |
587 | 15. Disclaimer of Warranty.
588 |
589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 |
598 | 16. Limitation of Liability.
599 |
600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 |
610 | 17. Interpretation of Sections 15 and 16.
611 |
612 | If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 |
619 | END OF TERMS AND CONDITIONS
620 |
621 | How to Apply These Terms to Your New Programs
622 |
623 | If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 |
627 | To do so, attach the following notices to the program. It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 |
632 | <one line to give the program's name and a brief idea of what it does.>
633 | Copyright (C) <year>  <name of author>
634 |
635 | This program is free software: you can redistribute it and/or modify
636 | it under the terms of the GNU Affero General Public License as published
637 | by the Free Software Foundation, either version 3 of the License, or
638 | (at your option) any later version.
639 |
640 | This program is distributed in the hope that it will be useful,
641 | but WITHOUT ANY WARRANTY; without even the implied warranty of
642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643 | GNU Affero General Public License for more details.
644 |
645 | You should have received a copy of the GNU Affero General Public License
646 | along with this program. If not, see <https://www.gnu.org/licenses/>.
647 |
648 | Also add information on how to contact you by electronic and paper mail.
649 |
650 | If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source. For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code. There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 |
658 | You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <https://www.gnu.org/licenses/>.
662 |
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/LICENSE:
--------------------------------------------------------------------------------
1 | GNU AFFERO GENERAL PUBLIC LICENSE
2 | Version 3, 19 November 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU Affero General Public License is a free, copyleft license for
11 | software and other kinds of works, specifically designed to ensure
12 | cooperation with the community in the case of network server software.
13 |
14 | The licenses for most software and other practical works are designed
15 | to take away your freedom to share and change the works. By contrast,
16 | our General Public Licenses are intended to guarantee your freedom to
17 | share and change all versions of a program--to make sure it remains free
18 | software for all its users.
19 |
20 | When we speak of free software, we are referring to freedom, not
21 | price. Our General Public Licenses are designed to make sure that you
22 | have the freedom to distribute copies of free software (and charge for
23 | them if you wish), that you receive source code or can get it if you
24 | want it, that you can change the software or use pieces of it in new
25 | free programs, and that you know you can do these things.
26 |
27 | Developers that use our General Public Licenses protect your rights
28 | with two steps: (1) assert copyright on the software, and (2) offer
29 | you this License which gives you legal permission to copy, distribute
30 | and/or modify the software.
31 |
32 | A secondary benefit of defending all users' freedom is that
33 | improvements made in alternate versions of the program, if they
34 | receive widespread use, become available for other developers to
35 | incorporate. Many developers of free software are heartened and
36 | encouraged by the resulting cooperation. However, in the case of
37 | software used on network servers, this result may fail to come about.
38 | The GNU General Public License permits making a modified version and
39 | letting the public access it on a server without ever releasing its
40 | source code to the public.
41 |
42 | The GNU Affero General Public License is designed specifically to
43 | ensure that, in such cases, the modified source code becomes available
44 | to the community. It requires the operator of a network server to
45 | provide the source code of the modified version running there to the
46 | users of that server. Therefore, public use of a modified version, on
47 | a publicly accessible server, gives the public access to the source
48 | code of the modified version.
49 |
50 | An older license, called the Affero General Public License and
51 | published by Affero, was designed to accomplish similar goals. This is
52 | a different license, not a version of the Affero GPL, but Affero has
53 | released a new version of the Affero GPL which permits relicensing under
54 | this license.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | TERMS AND CONDITIONS
60 |
61 | 0. Definitions.
62 |
63 | "This License" refers to version 3 of the GNU Affero General Public License.
64 |
65 | "Copyright" also means copyright-like laws that apply to other kinds of
66 | works, such as semiconductor masks.
67 |
68 | "The Program" refers to any copyrightable work licensed under this
69 | License. Each licensee is addressed as "you". "Licensees" and
70 | "recipients" may be individuals or organizations.
71 |
72 | To "modify" a work means to copy from or adapt all or part of the work
73 | in a fashion requiring copyright permission, other than the making of an
74 | exact copy. The resulting work is called a "modified version" of the
75 | earlier work or a work "based on" the earlier work.
76 |
77 | A "covered work" means either the unmodified Program or a work based
78 | on the Program.
79 |
80 | To "propagate" a work means to do anything with it that, without
81 | permission, would make you directly or secondarily liable for
82 | infringement under applicable copyright law, except executing it on a
83 | computer or modifying a private copy. Propagation includes copying,
84 | distribution (with or without modification), making available to the
85 | public, and in some countries other activities as well.
86 |
87 | To "convey" a work means any kind of propagation that enables other
88 | parties to make or receive copies. Mere interaction with a user through
89 | a computer network, with no transfer of a copy, is not conveying.
90 |
91 | An interactive user interface displays "Appropriate Legal Notices"
92 | to the extent that it includes a convenient and prominently visible
93 | feature that (1) displays an appropriate copyright notice, and (2)
94 | tells the user that there is no warranty for the work (except to the
95 | extent that warranties are provided), that licensees may convey the
96 | work under this License, and how to view a copy of this License. If
97 | the interface presents a list of user commands or options, such as a
98 | menu, a prominent item in the list meets this criterion.
99 |
100 | 1. Source Code.
101 |
102 | The "source code" for a work means the preferred form of the work
103 | for making modifications to it. "Object code" means any non-source
104 | form of a work.
105 |
106 | A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 |
111 | The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form. A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 |
122 | The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities. However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work. For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 |
135 | The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 |
139 | The Corresponding Source for a work in source code form is that
140 | same work.
141 |
142 | 2. Basic Permissions.
143 |
144 | All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met. This License explicitly affirms your unlimited
147 | permission to run the unmodified Program. The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work. This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 |
152 | You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force. You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright. Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 |
163 | Conveying under any other circumstances is permitted solely under
164 | the conditions stated below. Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 |
167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 |
169 | No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 |
175 | When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 |
183 | 4. Conveying Verbatim Copies.
184 |
185 | You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 |
193 | You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 |
196 | 5. Conveying Modified Source Versions.
197 |
198 | You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 |
202 | a) The work must carry prominent notices stating that you modified
203 | it, and giving a relevant date.
204 |
205 | b) The work must carry prominent notices stating that it is
206 | released under this License and any conditions added under section
207 | 7. This requirement modifies the requirement in section 4 to
208 | "keep intact all notices".
209 |
210 | c) You must license the entire work, as a whole, under this
211 | License to anyone who comes into possession of a copy. This
212 | License will therefore apply, along with any applicable section 7
213 | additional terms, to the whole of the work, and all its parts,
214 | regardless of how they are packaged. This License gives no
215 | permission to license the work in any other way, but it does not
216 | invalidate such permission if you have separately received it.
217 |
218 | d) If the work has interactive user interfaces, each must display
219 | Appropriate Legal Notices; however, if the Program has interactive
220 | interfaces that do not display Appropriate Legal Notices, your
221 | work need not make them do so.
222 |
223 | A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit. Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 |
233 | 6. Conveying Non-Source Forms.
234 |
235 | You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 |
240 | a) Convey the object code in, or embodied in, a physical product
241 | (including a physical distribution medium), accompanied by the
242 | Corresponding Source fixed on a durable physical medium
243 | customarily used for software interchange.
244 |
245 | b) Convey the object code in, or embodied in, a physical product
246 | (including a physical distribution medium), accompanied by a
247 | written offer, valid for at least three years and valid for as
248 | long as you offer spare parts or customer support for that product
249 | model, to give anyone who possesses the object code either (1) a
250 | copy of the Corresponding Source for all the software in the
251 | product that is covered by this License, on a durable physical
252 | medium customarily used for software interchange, for a price no
253 | more than your reasonable cost of physically performing this
254 | conveying of source, or (2) access to copy the
255 | Corresponding Source from a network server at no charge.
256 |
257 | c) Convey individual copies of the object code with a copy of the
258 | written offer to provide the Corresponding Source. This
259 | alternative is allowed only occasionally and noncommercially, and
260 | only if you received the object code with such an offer, in accord
261 | with subsection 6b.
262 |
263 | d) Convey the object code by offering access from a designated
264 | place (gratis or for a charge), and offer equivalent access to the
265 | Corresponding Source in the same way through the same place at no
266 | further charge. You need not require recipients to copy the
267 | Corresponding Source along with the object code. If the place to
268 | copy the object code is a network server, the Corresponding Source
269 | may be on a different server (operated by you or a third party)
270 | that supports equivalent copying facilities, provided you maintain
271 | clear directions next to the object code saying where to find the
272 | Corresponding Source. Regardless of what server hosts the
273 | Corresponding Source, you remain obligated to ensure that it is
274 | available for as long as needed to satisfy these requirements.
275 |
276 | e) Convey the object code using peer-to-peer transmission, provided
277 | you inform other peers where the object code and Corresponding
278 | Source of the work are being offered to the general public at no
279 | charge under subsection 6d.
280 |
281 | A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 |
285 | A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling. In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage. For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product. A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 |
298 | "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source. The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 |
306 | If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information. But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 |
317 | The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed. Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 |
325 | Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 |
331 | 7. Additional Terms.
332 |
333 | "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law. If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 |
342 | When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it. (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.) You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 |
349 | Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 |
353 | a) Disclaiming warranty or limiting liability differently from the
354 | terms of sections 15 and 16 of this License; or
355 |
356 | b) Requiring preservation of specified reasonable legal notices or
357 | author attributions in that material or in the Appropriate Legal
358 | Notices displayed by works containing it; or
359 |
360 | c) Prohibiting misrepresentation of the origin of that material, or
361 | requiring that modified versions of such material be marked in
362 | reasonable ways as different from the original version; or
363 |
364 | d) Limiting the use for publicity purposes of names of licensors or
365 | authors of the material; or
366 |
367 | e) Declining to grant rights under trademark law for use of some
368 | trade names, trademarks, or service marks; or
369 |
370 | f) Requiring indemnification of licensors and authors of that
371 | material by anyone who conveys the material (or modified versions of
372 | it) with contractual assumptions of liability to the recipient, for
373 | any liability that these contractual assumptions directly impose on
374 | those licensors and authors.
375 |
376 | All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10. If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term. If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 |
386 | If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 |
391 | Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 |
395 | 8. Termination.
396 |
397 | You may not propagate or modify a covered work except as expressly
398 | provided under this License. Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 |
403 | However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 |
410 | Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 |
417 | Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License. If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 |
423 | 9. Acceptance Not Required for Having Copies.
424 |
425 | You are not required to accept this License in order to receive or
426 | run a copy of the Program. Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance. However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work. These actions infringe copyright if you do
431 | not accept this License. Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 |
434 | 10. Automatic Licensing of Downstream Recipients.
435 |
436 | Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License. You are not responsible
439 | for enforcing compliance by third parties with this License.
440 |
441 | An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations. If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 |
451 | You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License. For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 |
459 | 11. Patents.
460 |
461 | A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based. The
463 | work thus licensed is called the contributor's "contributor version".
464 |
465 | A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version. For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 |
475 | Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 |
480 | In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement). To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 |
487 | If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients. "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 |
501 | If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 |
509 | A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License. You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 |
524 | Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 |
528 | 12. No Surrender of Others' Freedom.
529 |
530 | If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License. If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all. For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 |
540 | 13. Remote Network Interaction; Use with the GNU General Public License.
541 |
542 | Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software. This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 |
553 | Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work. The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 |
561 | 14. Revised Versions of this License.
562 |
563 | The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time. Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 |
568 | Each version is given a distinguishing version number. If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation. If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 |
577 | If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 |
582 | Later license versions may give you additional or different
583 | permissions. However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 |
587 | 15. Disclaimer of Warranty.
588 |
589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 |
598 | 16. Limitation of Liability.
599 |
600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 |
610 | 17. Interpretation of Sections 15 and 16.
611 |
612 | If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 |
619 | END OF TERMS AND CONDITIONS
620 |
621 | How to Apply These Terms to Your New Programs
622 |
623 | If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 |
627 | To do so, attach the following notices to the program. It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 |
632 | <one line to give the program's name and a brief idea of what it does.>
633 | Copyright (C) <year>  <name of author>
634 |
635 | This program is free software: you can redistribute it and/or modify
636 | it under the terms of the GNU Affero General Public License as published
637 | by the Free Software Foundation, either version 3 of the License, or
638 | (at your option) any later version.
639 |
640 | This program is distributed in the hope that it will be useful,
641 | but WITHOUT ANY WARRANTY; without even the implied warranty of
642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643 | GNU Affero General Public License for more details.
644 |
645 | You should have received a copy of the GNU Affero General Public License
646 | along with this program. If not, see <https://www.gnu.org/licenses/>.
647 |
648 | Also add information on how to contact you by electronic and paper mail.
649 |
650 | If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source. For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code. There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 |
658 | You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <https://www.gnu.org/licenses/>.
662 |
--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/generate_feature.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 | import pickle
4 | from collections import Counter
5 | from utils import get_new_cols
6 | import numpy as np
7 | import pandas as pd
8 | from tqdm import tqdm
9 | from gensim.models import Word2Vec
10 | from utils import GENERATION_DIR
11 | from utils import KEY_1, KEY_2, KEY_3, KEY_4
12 |
13 | from scipy import stats
14 |
15 | def cat2num(df, cat_cols, Transfer2num=True):
16 | '''
17 | 
18 | :param df: input DataFrame
19 | :param cat_cols: list of categorical feature columns
20 | :param Transfer2num: whether to convert categorical features to numeric label encodings
21 | :return: DataFrame with the encoded columns added
22 | '''
23 | if Transfer2num:
24 |
25 | print('Transferring category features to numeric features')
26 | for col in cat_cols:
27 |
28 | if not os.path.exists(os.path.join(GENERATION_DIR, f'{col}_map.pkl')):
29 | print(f'Transfer : {col}')
30 | tmp_map = dict(zip(df[col].unique(), range(df[col].nunique())))
31 | with open(os.path.join(GENERATION_DIR, f'{col}_map.pkl'), 'wb') as f:
32 | pickle.dump(tmp_map, f)
33 | else:
34 | with open(os.path.join(GENERATION_DIR, f'{col}_map.pkl'), 'rb') as f:
35 | tmp_map = pickle.load(f)
36 | df[f'{col}_LabelEnc'] = df[col].map(tmp_map).fillna(-1).astype(int)
37 | else:
38 | print('Casting category features to pandas category dtype')
39 | for col in cat_cols:
40 | df[col] = df[col].astype('category')
41 | print('Transfer category feature to num feature Done...')
42 | return df
43 |
44 | def add_minutes(x, minutes=5):  # shift a '%Y-%m-%d %H:%M:%S' timestamp string by the given number of minutes
45 | dt = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
46 | out_date = (dt + datetime.timedelta(minutes=minutes)
47 | ).strftime('%Y-%m-%d %H:%M:%S')
48 | return out_date
49 |
50 |
51 | def time_process(df, time_cols, minutes_):  # add a column f'time_{minutes_}' holding the shifted timestamps
52 | df[f'time_{minutes_}'] = df[time_cols].apply(
53 | lambda x: add_minutes(str(x), minutes_))
54 | return df
55 |
56 |
57 | def get_fea(x, fea):  # binary indicator: 1 if fea occurs in x, else 0
58 | if fea in x:
59 | return 1
60 | else:
61 | return 0
62 |
63 |
64 | def get_last_msg_cnt(x):  # count how often the last message in the list occurs
65 | last_msg = x[-1]
66 | cnt = x.count(last_msg)
67 | return cnt
68 |
69 |
70 | def get_first_msg_cnt(x):  # count how often the first message in the list occurs
71 | first_msg = x[0]
72 | cnt = x.count(first_msg)
73 | return cnt
74 |
75 |
76 | def add_last_next_time4fault(label, preliminary_submit_dataset_a,
77 | time_interval, next_time_list):
78 | print(f'Adding time points before/after each fault at custom interval {time_interval}')
79 | for i in tqdm([-i for i in next_time_list] + next_time_list):
80 | label = time_process(label, 'fault_time', i * time_interval)
81 | preliminary_submit_dataset_a = time_process(
82 | preliminary_submit_dataset_a, 'fault_time', i * time_interval)
83 |
84 | return label, preliminary_submit_dataset_a
85 |
86 |
87 | def get_msg_text_fea(df, msg_type='last'):
88 | print(f'Extracting {msg_type} msg text features')
89 |
90 | df_fea = df.groupby(['sn', 'fault_time']).agg(
91 | {'msg_list': 'sum', 'msg_0': 'sum', 'msg_1': 'sum', 'msg_2': 'sum'}).reset_index()
92 | df_fea['msg_list_unique'] = df_fea['msg_list'].apply(lambda x: str(set(x)))
93 | df_fea['msg_0_unique'] = df_fea['msg_0'].apply(lambda x: str(set(x)))
94 | df_fea['msg_1_unique'] = df_fea['msg_1'].apply(lambda x: str(set(x)))
95 | df_fea['msg_2_unique'] = df_fea['msg_2'].apply(lambda x: str(set(x)))
96 |
97 | df_fea['msg_list_list'] = df_fea['msg_list'].apply(lambda x: str(x))
98 | df_fea['msg_0_list'] = df_fea['msg_0'].apply(lambda x: str(x))
99 | df_fea['msg_1_list'] = df_fea['msg_1'].apply(lambda x: str(x))
100 | df_fea['msg_2_list'] = df_fea['msg_2'].apply(lambda x: str(x))
101 |
102 | df_fea['msg_0_first'] = df_fea['msg_0'].apply(lambda x: x[0])
103 | df_fea['msg_1_first'] = df_fea['msg_1'].apply(lambda x: x[0])
104 | df_fea['msg_2_first'] = df_fea['msg_2'].apply(lambda x: x[0])
105 |
106 | df_fea['msg_0_last'] = df_fea['msg_0'].apply(lambda x: x[-1])
107 | df_fea['msg_1_last'] = df_fea['msg_1'].apply(lambda x: x[-1])
108 | df_fea['msg_2_last'] = df_fea['msg_2'].apply(lambda x: x[-1])
109 |
110 | df_fea['msg_last'] = df.groupby(['sn', 'fault_time']).apply(
111 | lambda x: x['msg'].to_list()[-1]).values
112 | df_fea['msg_first'] = df.groupby(['sn', 'fault_time']).apply(
113 | lambda x: x['msg'].to_list()[0]).values
114 |
115 | df_fea['last_msg_cnt'] = df_fea['msg_list'].apply(
116 | lambda x: get_last_msg_cnt(x))
117 | df_fea['first_msg_cnt'] = df_fea['msg_list'].apply(
118 | lambda x: get_first_msg_cnt(x))
119 | cat_cols = ['msg_list', 'msg_0', 'msg_1', 'msg_2',
120 | 'msg_list_unique', 'msg_0_unique', 'msg_1_unique', 'msg_2_unique',
121 | 'msg_list_list', 'msg_0_list', 'msg_1_list', 'msg_2_list',
122 | 'msg_0_first', 'msg_1_first', 'msg_2_first', 'msg_0_last', 'msg_1_last',
123 | 'msg_2_last', 'msg_last', 'msg_first']
124 | num_cols = ['last_msg_cnt', 'first_msg_cnt']
125 | id_cols = ['sn', 'fault_time']
126 |
127 | df_fea = df_fea.rename(
128 | columns={
129 | i: f'{msg_type}_{i}' for i in (cat_cols + num_cols)})
130 | cat_cols = [f'{msg_type}_{i}' for i in cat_cols]
131 | for cat_col in cat_cols:
132 | df_fea[cat_col] = df_fea[cat_col].astype(str)
133 | df_fea = cat2num(df_fea, cat_cols, Transfer2num=True)
134 | for i in cat_cols:
135 | del df_fea[i]
136 | return df_fea
137 |
138 | def add_w2v_feats(all_data, w2v_feats_df, f1, f2, emb_size=32, window=5, min_count=5):
139 |     print(f'Generating {f1}_{f2}_w2v features')
140 | 
141 |     df_fea = all_data.groupby(f1).agg({f2: 'sum'}).reset_index()
142 |     df_emb = df_fea[[f1]]
143 |     sentences = df_fea[f2].to_list()
144 |     if not os.path.exists(os.path.join(GENERATION_DIR, f'{f1}_{f2}_w2v_model.pkl')):
145 |         print(f'{f1}_{f2}_w2v_model not found, training......')
146 |         model = Word2Vec(sentences, vector_size=emb_size, window=window,
147 |                          min_count=min_count, sg=0, hs=1, seed=42)
148 |         with open(os.path.join(GENERATION_DIR, f'{f1}_{f2}_w2v_model.pkl'), 'wb') as f:
149 |             pickle.dump(model, f)
150 |     else:
151 |         print(f'{f1}_{f2}_w2v_model already exists, loading......')
152 |         with open(os.path.join(GENERATION_DIR, f'{f1}_{f2}_w2v_model.pkl'), 'rb') as f:
153 |             model = pickle.load(f)
154 |
155 |     emb_matrix_mean = []
156 |     for sent in sentences:
157 |         vec = []
158 |         for w in sent:
159 |             if w in model.wv:
160 |                 vec.append(model.wv[w])
161 |         if len(vec) > 0:
162 |             emb_matrix_mean.append(np.mean(vec, axis=0))
163 |         else:
164 |             emb_matrix_mean.append([0] * emb_size)
165 | df_emb_mean = pd.DataFrame(emb_matrix_mean).add_prefix(f'{f1}_{f2}_w2v_')
166 |
167 |     df_emb = pd.concat([df_emb, df_emb_mean], axis=1)
168 |     w2v_feats_df = w2v_feats_df.merge(df_emb, on=f1, how='left')
169 | return w2v_feats_df
170 | def get_w2v_feats(all_data, f1_list, f2_list):
171 | all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
172 | all_data['msg_0'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 0)])
173 | all_data['msg_1'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 1)])
174 | all_data['msg_2'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 2)])
175 | w2v_feats_df = all_data[f1_list].drop_duplicates()
176 | for f1 in f1_list:
177 | for f2 in f2_list:
178 |             w2v_feats_df = add_w2v_feats(all_data, w2v_feats_df, f1, f2, emb_size=10, window=5, min_count=5)
179 |     print(f'w2v_feats feature shape: {w2v_feats_df.shape}')
180 | return w2v_feats_df
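
# --- A minimal sketch (toy corpus) of the mean-pooling step inside add_w2v_feats:
# fit Word2Vec on token "sentences", then average the in-vocabulary word vectors
# of a sentence to get its embedding. The toy sentences are repeated so every
# token clears min_count=5.
def _demo_w2v_mean_pooling():
    sents = [['Memory', 'IERR', 'Drive Fault']] * 10
    model = Word2Vec(sents, vector_size=4, window=5, min_count=5, sg=0, hs=1, seed=42)
    vec = np.mean([model.wv[w] for w in sents[0] if w in model.wv], axis=0)
    print(vec.shape)  # (4,)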
181 |
182 |
183 |
184 | def get_time_diff_feats_v2(all_data):
185 |     print('Generating time-difference features time_diff_feats_v2')
186 | all_data['duration_seconds'] = all_data['time_interval']
187 | all_data['duration_minutes'] = all_data['time_interval'] / 60
188 | df_merge_log = all_data[['sn', 'fault_time', 'label', 'time', 'msg',
189 | 'server_model', 'time_interval', 'duration_seconds',
190 | 'duration_minutes']]
191 | df_merge_log['fault_id'] = df_merge_log['sn'] + '_' + df_merge_log['fault_time'] + '_' + df_merge_log[
192 | 'server_model']
193 | f1_list = ['fault_id', 'sn', 'server_model']
194 | f2_list = ['duration_minutes', 'duration_seconds']
195 | time_diff_feats_v2 = df_merge_log[['sn', 'fault_time', 'fault_id', 'server_model']].drop_duplicates().reset_index(
196 | drop=True)
197 |
198 | for f1 in f1_list:
199 | for f2 in f2_list:
200 | func_opt = ['count', 'nunique', 'min', 'max', 'median', 'sum']
201 | for opt in func_opt:
202 | tmp = df_merge_log.groupby([f1])[f2].agg([(f'{f2}_in_{f1}_' + opt, opt)]).reset_index()
203 | # print(f'{f1}_in_{f2}_{opt}:{tmp.shape}' )
204 | time_diff_feats_v2 = time_diff_feats_v2.merge(tmp, on=f1, how='left')
205 |
206 |             temp = df_merge_log.groupby([f1])[f2].apply(lambda x: stats.mode(x)[0][0])  # scipy<1.11 mode API
207 |             time_diff_feats_v2[f'{f2}_in_{f1}_mode'] = time_diff_feats_v2[f1].map(temp)
208 | secs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
209 | for sec in secs:
210 | temp = df_merge_log.groupby([f1])[f2].quantile(sec).reset_index(
211 | name=f'log_{f2}_in_{f1}_quantile_' + str(sec * 100))
212 | # print(f'log_{f1}_in_{f2}_quantile_{str(sec * 100)}:{tmp.shape}' )
213 | time_diff_feats_v2 = pd.merge(time_diff_feats_v2, temp, on=f1, how='left')
214 | del time_diff_feats_v2['fault_id']
215 | return time_diff_feats_v2
216 |
217 | def get_feature(data, time_list, log_fea, fea_num, key):
218 |     print(f'Current feature shape: {data.shape}')
219 | fea_df_list = []
220 | fea_cnt_list = ['OEM record c2', 'Processor CPU_Core_Error', '001c4c', 'System Event Sys_Event','OEM CPU0 MCERR',
221 | 'OEM CPU0 CATERR', 'Reading 0 < Threshold 2 degrees C', '0203c0a80101',
222 | 'Unknown CPU0 MCERR', 'Unknown CPU0 CATERR','Memory', 'Correctable ECC logging limit reached',
223 | 'Memory MEM_CHE0_Status', 'Memory Memory_Status', 'Memory #0x87', 'Memory CPU0F0_DIMM_Stat',
224 | 'Drive Fault', 'NMI/Diag Interrupt', 'Failure detected', 'Power Supply AC lost', ]
225 | for time_tmp in tqdm(time_list):
226 |         print(f'Aggregating log data within {time_tmp} min before/after the fault')
227 | tmp1 = data[(pd.to_datetime(data['time']) < pd.to_datetime(data[f'time_{time_tmp}'])) & (pd.to_datetime(data['time']) > pd.to_datetime(data[f'time_-{time_tmp}']))].sort_values(
228 | ['sn', 'fault_time'])
229 | tmp1 = tmp1.groupby(key).apply(
230 | lambda x: ' | '.join(x['msg'].to_list())).reset_index().rename(columns={0: 'msg'})
231 | tmp1[f'msg_len'] = tmp1['msg'].apply(lambda x: len(x.split(' | ')))
232 | # tmp1[f'msg_len_two'] = tmp1['msg'].apply(lambda x: len(x))
233 |         # count of numeric tokens (disabled)
234 |         # tmp1[f'msg_num_two'] = tmp1['msg'].apply(
235 |         #     lambda x: len([int(s) for s in re.findall(r'\b\d+\b', x)]))
236 |         print(f'Extracting {fea_num} sparse keyword features from logs within {time_tmp} min of the fault')
237 |         feature = log_fea + ['msg_len']
238 |         for fea in list(feature):  # iterate a snapshot: appending to `feature` below must not re-enter the loop and overwrite the count columns
239 |             tmp1[fea] = tmp1['msg'].apply(lambda x: get_fea(x, fea))
240 |             # add count features for the high-signal keywords
241 |             if fea in fea_cnt_list:
242 |                 tmp1[f'{fea}_cnt'] = tmp1['msg'].apply(lambda x: x.replace('|', ' ').replace('_', ' ').split(' ').count(fea))
243 |                 feature.append(f'{fea}_cnt')
244 | tmp1_new_col_map = {i: i + '_' + str(int(time_tmp)) for i in feature}
245 | tmp1 = tmp1.rename(columns=tmp1_new_col_map)
246 | del tmp1['msg']
247 | fea_df_list.append(tmp1)
248 | fea_df = fea_df_list[-1]
249 | print(fea_df.shape)
250 | for i in fea_df_list[:-1]:
251 | fea_df = fea_df.merge(i, on=key, how='left')
252 | print(fea_df.shape)
253 | return fea_df
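
# --- A minimal sketch (hypothetical msg) of the two per-keyword encodings used in
# get_feature above: a 0/1 substring-presence flag via get_fea, and a token count
# after normalising '|' and '_' to spaces. The count only matches single tokens,
# so multi-word keys in fea_cnt_list (e.g. 'Drive Fault') always count 0.
def _demo_keyword_encodings():
    msg = 'Memory | Drive Fault | Memory_Status | Memory'
    print(get_fea(msg, 'Drive Fault'))  # 1
    tokens = msg.replace('|', ' ').replace('_', ' ').split(' ')
    print(tokens.count('Memory'))       # 3
    print(tokens.count('Drive Fault'))  # 0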
254 |
255 |
256 | def get_msg_location(x, num):
257 |     try:
258 |         return x[num]
259 |     except IndexError:
260 |         return '其它'  # sentinel category meaning 'other'; kept verbatim so encodings stay stable
261 |
262 |
263 | def get_nearest_msg_fea(train, test):
264 |     print('Generating nearest_msg features')
265 | df = pd.concat([train, test], axis=0, ignore_index=True)
266 | df['duration_minutes'] = (pd.to_datetime(df['fault_time']) - pd.to_datetime(df['time'])).apply(
267 |         lambda x: x.total_seconds())  # note: stored in seconds despite the 'minutes' name
268 | df = df.sort_values(
269 | ['sn', 'server_model', 'fault_time', 'time']).reset_index(drop=True)
270 | df['duration_minutes_abs'] = np.abs(df['duration_minutes'])
271 |
272 | df['duration_minutes_abs_rank'] = df.groupby(['sn', 'server_model', 'fault_time'])['duration_minutes_abs'].rank(
273 | method='first', ascending=False)
274 |
275 | key = ['sn', 'server_model', 'fault_time', 'duration_minutes_abs']
276 | df = df.sort_values(key, ascending=False)
277 | df = df.drop_duplicates(
278 | ['sn', 'server_model', 'fault_time', ], keep='first')
279 |
280 | df.loc[df['duration_minutes'] ==
281 | df['duration_minutes_abs'], 'last_or_next'] = 1
282 | df.loc[df['duration_minutes'] !=
283 | df['duration_minutes_abs'], 'last_or_next'] = 0
284 | df['msg_cnt'] = df['msg'].map(df['msg'].value_counts())
285 | df['msg_0'] = df['msg'].apply(
286 | lambda x: get_msg_location(
287 | x.split(' | '), 0))
288 | df['msg_0_cnt'] = df['msg_0'].map(df['msg_0'].value_counts())
289 | df['msg_1'] = df['msg'].apply(
290 | lambda x: get_msg_location(
291 | x.split(' | '), 1))
292 | df['msg_1_cnt'] = df['msg_1'].map(df['msg_1'].value_counts())
293 | df['msg_2'] = df['msg'].apply(
294 | lambda x: get_msg_location(
295 | x.split(' | '), 2))
296 | df['msg_2_cnt'] = df['msg_2'].map(df['msg_2'].value_counts())
297 | cat_feats = ['msg', 'msg_0', 'msg_1',
298 | 'msg_2'] # ,'server_model_day_date','server_model_dayofmonth','server_model_dayofweek','server_model_hour']
299 | # for name in cat_feats:
300 | # le = LabelEncoder()
301 | # df[f'{name}_LabelEnc'] = le.fit_transform(df[name])
302 | df = cat2num(df,cat_feats)
303 | df = df.drop_duplicates().reset_index(drop=True)
304 | df = df[['sn', 'server_model', 'fault_time', 'msg_cnt',
305 | 'msg_0_cnt', 'msg_1_cnt', 'msg_2_cnt',
306 | # 'duration_minutes_abs','duration_minutes', 'duration_minutes_abs_rank',
307 | 'last_or_next', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc']]
308 |     print(f'Finished generating nearest_msg features, shape {df.shape}')
309 | return df
310 |
311 | def get_server_model_time_interval_stat_fea(all_data):
312 |     server_model_time_interval_stat_fea = all_data.groupby('server_model').agg({'time_interval': ['min', 'max', 'mean', 'median']}).reset_index()
313 |     server_model_time_interval_stat_fea = get_new_cols(server_model_time_interval_stat_fea, key=['server_model'])
314 | 
315 |     server_model_time_interval_stat_fea.columns = ['server_model', 'sm_time_interval_min', 'sm_time_interval_max',
316 |                                                    'sm_time_interval_mean', 'sm_time_interval_median']
317 |     return server_model_time_interval_stat_fea
318 |
319 | def get_server_model_sn_fea_2(train, test):
320 | df = pd.concat([train[['sn', 'server_model']],
321 | test[['sn', 'server_model']]], ignore_index=True)
322 | df['server_model_count_sn_2'] = df.groupby(
323 | ['server_model'])['sn'].transform('count')
324 | df['server_model_nunique_sn_2'] = df.groupby(
325 | ['server_model'])['sn'].transform('nunique')
326 | df['sn_cnt_2'] = df['sn'].map(df['sn'].value_counts())
327 | return df.drop_duplicates().reset_index(drop=True)
328 |
329 |
330 | def get_4_time_stat_fea(df):
331 |     print(' Generating time statistic features')
332 | time_stat_fea_df = df.groupby(['sn', 'fault_time', 'server_model']).agg(
333 | {'duration_minutes': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std', 'count'],
334 | 'log_duration_minutes': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std'],
335 | 'time_diff_1': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std'],
336 | 'log_time_diff_1': ['min', 'max', 'median'],
337 | }).reset_index()
338 | new_time_stat_cols = []
339 | for i in time_stat_fea_df.columns:
340 | if i[0] in ['sn', 'fault_time', 'server_model']:
341 | new_time_stat_cols.append(i[0])
342 | else:
343 | new_time_stat_cols.append(f'{i[0]}_{i[1]}')
344 | # print(f'{i[0]}_{i[1]}')
345 | time_stat_fea_df.loc[time_stat_fea_df[i[0]]
346 | [i[1]] == -np.inf, (i[0], i[1])] = -20
347 | time_stat_fea_df.loc[time_stat_fea_df[i[0]]
348 | [i[1]] == np.inf, (i[0], i[1])] = 30
349 | time_stat_fea_df.columns = new_time_stat_cols
350 | time_stat_fea_df['duration_minutes_range'] = time_stat_fea_df['duration_minutes_max'] - time_stat_fea_df[
351 | 'duration_minutes_min']
352 | time_stat_fea_df['log_duration_minutes_range'] = time_stat_fea_df['log_duration_minutes_max'] - time_stat_fea_df[
353 | 'log_duration_minutes_min']
354 | time_stat_fea_df['time_diff_1_range'] = time_stat_fea_df['time_diff_1_max'] - \
355 | time_stat_fea_df['time_diff_1_min']
356 | time_stat_fea_df['log_time_diff_1_range'] = time_stat_fea_df['log_time_diff_1_max'] - time_stat_fea_df[
357 | 'log_time_diff_1_min']
358 | time_stat_fea_df['duration_minutes_freq'] = time_stat_fea_df['duration_minutes_range'] / time_stat_fea_df[
359 | 'duration_minutes_count']
360 |     print(f' Finished generating time statistic features, shape: {time_stat_fea_df.shape}')
361 | return time_stat_fea_df
362 |
363 |
364 | def get_time_std_fea(train, test):
365 |     print('Generating time std features')
366 | df = pd.concat([train, test], axis=0, ignore_index=True)
367 | # df['year'] = df['time'].dt.year
368 | # df['month'] = df['time'].dt.month
369 | df['hour'] = df['time'].dt.hour
370 | # df['week'] = df['time'].dt.week
371 | df['minute'] = df['time'].dt.minute
372 | time_std = df.groupby(['sn', 'server_model']).agg(
373 | {'hour': 'std', 'minute': 'std'}).reset_index()
374 | time_std = time_std.rename(
375 | columns={
376 | 'hour': 'hour_std',
377 | 'minute': 'minute_std'})
378 | return time_std
379 |
380 |
381 | def get_key(all_data):
382 | all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
383 | class_fea_cnt_list = []
384 | for label in [0,1,2,3]:
385 | class_df = all_data.query(f'label =={label}')
386 | counter = Counter()
387 | for i in class_df['msg_list']:
388 | counter.update(i)
389 | class_fea_cnt = pd.DataFrame({i[0]:i[1] for i in counter.most_common()},index = [f'fea_cnt_{label}']).T.reset_index().rename(columns = {'index':'fea'})
390 | class_fea_cnt_list.append(class_fea_cnt)
391 |
392 | fea_cnt_df = class_fea_cnt_list[0]
393 | for tmp in class_fea_cnt_list[1:]:
394 | fea_cnt_df = fea_cnt_df.merge(tmp,on = 'fea')
395 |
396 | fea_cnt_df['fea_cnt_sum'] = fea_cnt_df.loc[:,['fea_cnt_0', 'fea_cnt_1', 'fea_cnt_2', 'fea_cnt_3']].sum(1)
397 |
398 | all_fea_cnt = fea_cnt_df['fea_cnt_sum'].sum()
399 |
400 | for i in ['fea_cnt_0', 'fea_cnt_1', 'fea_cnt_2', 'fea_cnt_3']:
401 | fea_cnt_df[f'{i}_ratio'] = fea_cnt_df[i]/fea_cnt_df['fea_cnt_sum']
402 | fea_cnt_df[f'{i}_all_ratio'] = fea_cnt_df[i]/all_fea_cnt
403 |
404 | fea_cnt_df['fea_cnt_ratio_std'] = fea_cnt_df.loc[:,['fea_cnt_0_ratio','fea_cnt_1_ratio','fea_cnt_2_ratio','fea_cnt_3_ratio', ]].std(1)
405 | fea_cnt_df['fea_cnt_std'] = fea_cnt_df.loc[:,['fea_cnt_0', 'fea_cnt_1','fea_cnt_2','fea_cnt_3',]].std(1)
406 |
407 | fea_cnt_df['fea_cnt_all_ratio_std'] = fea_cnt_df.loc[:,['fea_cnt_0_all_ratio','fea_cnt_1_all_ratio',
408 | 'fea_cnt_2_all_ratio','fea_cnt_3_all_ratio',]].std(1)
409 |
410 | fea_cnt_df = fea_cnt_df[~fea_cnt_df['fea_cnt_ratio_std'].isnull()].sort_values('fea_cnt_ratio_std',ascending = False)
411 |
412 | fea_cnt_df['fea_max'] = np.argmax(fea_cnt_df.loc[:,['fea_cnt_0', 'fea_cnt_1', 'fea_cnt_2', 'fea_cnt_3',]].values,axis = 1)
413 | key_0 = fea_cnt_df.query('fea_max ==0 ')['fea'].to_list()
414 | key_1 = fea_cnt_df.query('fea_max ==1 ')['fea'].to_list()
415 | key_2 = fea_cnt_df.query('fea_max ==2 ')['fea'].to_list()
416 | key_3 = fea_cnt_df.query('fea_max ==3 ')['fea'].to_list()
417 | # key_1 = ['OEM record c2','Processor CPU_Core_Error','001c4c','System Event Sys_Event','Power Supply PS0_Status','Temperature CPU0_Margin_Temp','Reading 51 > Threshold 85 degrees C','Lower Non-critical going low','Temperature CPU1_Margin_Temp','System ACPI Power State #0x7d','Lower Critical going low']
418 | # key_2 = ['OEM CPU0 MCERR','OEM CPU0 CATERR','Reading 0 < Threshold 2 degrees C','0203c0a80101','Unknown CPU0 MCERR','Unknown CPU0 CATERR','Microcontroller #0x3b','System Boot Initiated','Processor #0xfa','Power Unit Pwr Unit Status','Hard reset','Power off/down','System Event #0xff','Memory CPU1A1_DIMM_Stat','000000','Power cycle','OEM record c3','Memory CPU1C0_DIMM_Stat','Reading 0 < Threshold 1 degrees C','IERR']
419 | # key_3 = ['Memory','Correctable ECC logging limit reached','Memory MEM_CHE0_Status','Memory Memory_Status','Memory #0x87','Memory CPU0F0_DIMM_Stat','Memory Device Disabled','Memory #0xe2','OS Stop/Shutdown OS Status','System Boot Initiated System Restart','OS Boot BIOS_Boot_Up','System Boot Initiated BIOS_Boot_UP','Memory DIMM101','OS graceful shutdown','OS Critical Stop OS Status','Memory #0xf9','Memory CPU0C0_DIMM_Stat','Memory DIMM111','Memory DIMM021',]
420 | # key_4 = ['Drive Fault','NMI/Diag Interrupt','Failure detected','Power Supply AC lost','Power Supply PSU0_Supply','AC out-of-range, but present','Predictive failure','Drive Present','Temperature Temp_DIMM_KLM','Temperature Temp_DIMM_DEF','Power Supply PS1_Status','Identify Status','Power Supply PS2_Status','Temperature DIMMG1_Temp','Upper Non-critical going high','Temperature DIMMG0_Temp','Upper Critical going high','Power Button pressed','System Boot Initiated #0xb8','Deasserted']
421 | return key_0,key_1,key_2,key_3
422 |
423 | def get_class_key_words_nunique(all_data):
424 |     print('Extracting class_key_words_nunique features')
425 |
426 | key_0,key_1,key_2,key_3 = get_key(all_data)
427 |
428 | df = all_data[['sn', 'fault_time', 'msg_list']]
429 | df_tmp = df.groupby(['sn' ]).agg({'msg_list':'sum'}).reset_index()
430 | df_tmp['class_0_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x:len(set(x)&set(key_0)))
431 | df_tmp['class_1_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x:len(set(x)&set(key_1)))
432 | df_tmp['class_2_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x:len(set(x)&set(key_2)))
433 | df_tmp['class_3_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x:len(set(x)&set(key_3)))
434 | del df_tmp['msg_list']
435 | return df_tmp
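
# --- A minimal sketch (hypothetical lists) of the overlap count computed by
# get_class_key_words_nunique: per sn, the feature is simply the number of
# distinct msg tokens that fall into each class's key set.
def _demo_key_words_overlap():
    msg_tokens = ['Memory', 'Drive Fault', 'Memory']
    class_keys = ['Drive Fault', 'Power Supply AC lost']
    print(len(set(msg_tokens) & set(class_keys)))  # 1
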
436 | def get_key_for_top_fea(train, test):
437 |     KEY_FOR_TOP_COLS = []
438 |     print('Adding key_for_top_fea features')
439 |     for TIME in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600, 60000000]:  # 60000000 min ~ no window limit
440 | for i in range(10):
441 | train[f'KEY_FOR_TOP_{i}_{TIME}'] = train[f'{KEY_1[i]}_{TIME}'].astype(str)+'_'+train[f'{KEY_2[i]}_{TIME}'].astype(str)+'_'+train[f'{KEY_3[i]}_{TIME}'].astype(str)+'_'+train[f'{KEY_4[i]}_{TIME}'].astype(str)
442 | test[f'KEY_FOR_TOP_{i}_{TIME}'] = test[f'{KEY_1[i]}_{TIME}'].astype(str)+'_'+test[f'{KEY_2[i]}_{TIME}'].astype(str)+'_'+test[f'{KEY_3[i]}_{TIME}'].astype(str)+'_'+test[f'{KEY_4[i]}_{TIME}'].astype(str)
443 | KEY_FOR_TOP_COLS.append(f'KEY_FOR_TOP_{i}_{TIME}')
444 | train = cat2num(train,KEY_FOR_TOP_COLS)
445 | test = cat2num(test,KEY_FOR_TOP_COLS)
446 | for KEY_FOR_TOP_COL in KEY_FOR_TOP_COLS:
447 | del train[KEY_FOR_TOP_COL]
448 | del test[KEY_FOR_TOP_COL]
449 | return train,test
450 |
451 | def get_key_word_cross_fea(train, test):
452 |     print('Generating key-word cross features......')
453 |     KEY_WORDS_MAP = {'CPU0': KEY_1, 'CPU1': KEY_2, 'CPU2': KEY_3, 'CPU3': KEY_4}
454 | KEY_WORDS_CROSS_COLS =[]
455 | for KEY_WORDS in KEY_WORDS_MAP:
456 | for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600,60000000]:
457 | KEY_WORDS_COLS = [f'{col}_{i}' for col in KEY_WORDS_MAP[KEY_WORDS]]
458 | train[f'{KEY_WORDS}_WORDS_{i}'] = train[KEY_WORDS_COLS].astype(str).sum(1)
459 | test[f'{KEY_WORDS}_WORDS_{i}'] = test[KEY_WORDS_COLS].astype(str).sum(1)
460 | KEY_WORDS_CROSS_COLS.append(f'{KEY_WORDS}_WORDS_{i}')
461 | train = cat2num(train,KEY_WORDS_CROSS_COLS)
462 | test = cat2num(test,KEY_WORDS_CROSS_COLS)
463 |
464 | for COLS in KEY_WORDS_CROSS_COLS:
465 | del train[COLS]
466 | del test[COLS]
467 |     print('Finished generating key-word cross features......')
468 |     return train, test
469 | def get_time_quantile_fea(df):
470 |     print(' Generating time quantile features')
471 | secs = [0.2, 0.4, 0.6, 0.8]
472 | time_fea_list = []
473 | for sec in tqdm(secs):
474 | for time_fea_type in [
475 | 'duration_minutes', 'log_duration_minutes', 'time_diff_1', 'log_time_diff_1']:
476 | temp = df.groupby(['sn', 'server_model', 'fault_time'])[time_fea_type].quantile(sec).reset_index(
477 | name=f'{time_fea_type}_' + str(sec * 100))
478 |
479 | time_fea_list.append(temp)
480 | time_fea_df = time_fea_list[0]
481 | for time_fea in time_fea_list[1:]:
482 | time_fea_df = time_fea_df.merge(
483 | time_fea, how='left', on=[
484 | 'sn', 'server_model', 'fault_time'])
485 |     print(f' Finished generating time quantile features, shape: {time_fea_df.shape}')
486 | return time_fea_df
487 |
488 |
489 | def get_server_model_fea(train, test):
490 |     print('Generating server_model features')
491 | df = pd.concat([train, test], axis=0, ignore_index=True)
492 | df['server_model_count_sn'] = df.groupby(
493 | ['server_model'])['sn'].transform('count')
494 | df['server_model_nunique_sn'] = df.groupby(
495 | ['server_model'])['sn'].transform('nunique')
496 | # df['server_model_count'] = df.groupby('server_model')['server_model'].transform('count')
497 | # df['server_model_cnt_quantile'] = df['server_model'].map(
498 | # df['server_model'].value_counts().rank() / len(df['server_model'].unique()))
499 | # df['server_model_cnt_rank'] = df[f'server_model_cnt_quantile'].rank(method='min')
500 |
501 | df['sn_cnt'] = df['sn'].map(df['sn'].value_counts())
502 | df['sn_freq'] = df['sn'].map(df['sn'].value_counts() / len(df))
503 | df['server_model_cnt'] = df['server_model'].map(
504 | df['server_model'].value_counts())
505 | df['server_model_freq'] = df['server_model'].map(
506 | df['server_model'].value_counts() / len(df))
507 | select_cols = ['sn', 'server_model',
508 | 'server_model_count_sn', 'server_model_nunique_sn',
509 | 'sn_cnt', 'sn_freq', 'server_model_cnt', 'server_model_freq'
510 | # 'server_model_count','server_model_cnt_quantile', 'server_model_cnt_rank'
511 | ]
512 | server_model_fea = df[select_cols]
513 |
514 | cat_feats = [
515 | 'server_model'] # ,'server_model_day_date','server_model_dayofmonth','server_model_dayofweek','server_model_hour']
516 | # for name in cat_feats:
517 | # le = LabelEncoder()
518 | # server_model_fea[f'{name}_LabelEnc'] = le.fit_transform(
519 | # server_model_fea[name])
520 | server_model_fea = cat2num(server_model_fea, cat_feats, Transfer2num=True)
521 | server_model_fea = server_model_fea.drop_duplicates().reset_index(drop=True)
522 |     print(f'Finished generating server_model features, shape: {server_model_fea.shape}')
523 |
524 | return server_model_fea
525 |
526 |
527 | def get_time_type_msg_unique_fea(df):
528 |     df['msg_list'] = df['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
529 | 
530 |     df['msg_0'] = df['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 0)])
531 |     df['msg_1'] = df['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 1)])
532 |     df['msg_2'] = df['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 2)])
546 |
547 | df = df.groupby(['sn', 'fault_time']).agg(
548 | {'msg_list': 'sum', 'msg_0': 'sum', 'msg_1': 'sum', 'msg_2': 'sum'}).reset_index()
549 |
550 | df['msg_set'] = df['msg_list'].apply(lambda x: '|'.join(list(set(x))))
551 |
552 | df['msg_0_set'] = df['msg_0'].apply(lambda x: '|'.join(list(set(x))))
553 | df['msg_1_set'] = df['msg_1'].apply(lambda x: '|'.join(list(set(x))))
554 | df['msg_2_set'] = df['msg_2'].apply(lambda x: '|'.join(list(set(x))))
555 | df = df[['sn', 'fault_time', 'msg_set',
556 | 'msg_0_set', 'msg_1_set', 'msg_2_set']]
557 | return df
558 |
559 |
560 | def get_msg_unique_fea(train, test, time_type='last'):
561 |     print('Generating msg_unique features')
562 | common_cols = ['msg_set', 'msg_0_set', 'msg_1_set', 'msg_2_set']
563 | df = pd.concat([train, test], axis=0, ignore_index=True)
564 |     df['time_interval'] = (pd.to_datetime(df['fault_time']) - df['time']).apply(
565 |         lambda x: x.total_seconds())
566 | 
570 | last_fea = get_time_type_msg_unique_fea(df.query('time_interval >0'))
571 | last_fea = last_fea.rename(columns={i: f'last_{i}' for i in common_cols})
572 | next_fea = get_time_type_msg_unique_fea(df.query('time_interval <0'))
573 | next_fea = next_fea.rename(columns={i: f'next_{i}' for i in common_cols})
574 | all_fea = get_time_type_msg_unique_fea(df)
575 | all_fea = all_fea.rename(columns={i: f'all_{i}' for i in common_cols})
576 | msg_unique_fea = all_fea.merge(
577 | last_fea, on=['sn', 'fault_time'], how='outer')
578 | msg_unique_fea = msg_unique_fea.merge(
579 | next_fea, on=['sn', 'fault_time'], how='outer')
580 | return msg_unique_fea
581 |
582 |
583 | def get_duration_minutes_fea(train, test):
584 |     print('Generating duration_minutes features')
585 | df = pd.concat([train, test], axis=0, ignore_index=True)
586 | df['duration_minutes'] = (pd.to_datetime(df['fault_time']) - pd.to_datetime(df['time'])).apply(
587 |         lambda x: x.total_seconds())  # note: stored in seconds despite the 'minutes' name
588 | df['log_duration_minutes'] = np.log(df['duration_minutes'])
589 |
590 | df = df.sort_values(['sn', 'label', 'server_model',
591 | 'fault_time', 'time']).reset_index(drop=True)
592 | df['time_diff_1'] = (df.groupby(['sn', 'server_model', 'fault_time'])['time'].diff(1)).apply(
593 | lambda x: x.total_seconds())
594 | df['time_diff_1'] = df['time_diff_1'].fillna(0)
595 | df['log_time_diff_1'] = np.log(df['time_diff_1'])
596 |
597 | # time_quantile_fea_df = get_time_quantile_fea(df)
598 | # time_stat_fea_df = get_4_time_stat_fea(df)
599 | # df_tmp = time_quantile_fea_df.merge(time_stat_fea_df, on= ['sn', 'server_model','fault_time'],how = 'left')
600 | time_stat_fea_df = get_4_time_stat_fea(df)
601 | df_tmp = time_stat_fea_df
602 |     print(f'Finished generating duration_minutes features, shape {df_tmp.shape}')
603 | return df_tmp
604 |
605 |
606 | def get_msg_text_fea_all(all_data):
607 | all_data['label'] = all_data['label'].fillna(-1)
608 | all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
609 | all_data['msg_0'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 0)])
610 | all_data['msg_1'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 1)])
611 | all_data['msg_2'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 2)])
612 |
613 | all_data = all_data.sort_values(['sn', 'fault_time', 'time']).reset_index(drop=True)
614 | del all_data['label']
615 | last_data = all_data.query('time_interval >0')
616 | next_data = all_data.query('time_interval <=0')
617 |
618 | # id_cols = ['sn', 'fault_time', 'label']
619 |
620 | # all_msg_text_fea = get_msg_text_fea(all_data, msg_type='all')
621 | last_msg_text_fea = get_msg_text_fea(last_data, msg_type='last')
622 | # next_msg_text_fea = get_msg_text_fea(next_data, msg_type='next')
623 | msg_text_fea = last_msg_text_fea
624 | return msg_text_fea
625 |
626 | def get_test_key_words(train,test):
627 |
628 | df = pd.concat([train[['sn', 'fault_time', 'label','msg']],test[['sn', 'fault_time', 'msg']]],ignore_index = True).drop_duplicates(['sn', 'fault_time', 'msg'])
629 | df['label'] = df['label'].fillna(5)
630 | df['msg_list'] = df['msg'].apply(lambda x:[i.strip() for i in x.split(' | ')])
631 | words_cnt_df_list = []
632 | for label in df['label'].unique():
633 | label = int(label)
634 | df_tmp = df.query(f'label == {label}')
635 | counter = Counter()
636 | for words in df_tmp['msg_list']:
637 | words = [i.replace('_',' ') for i in words]
638 | # word_list = []
639 | # for i in words:
640 | # word_list+=i.split(' ')
641 | # words = word_list
642 | counter.update(words)
643 | words_cnt_df = pd.DataFrame(counter,index = [0]).T.reset_index().rename(columns = {'index':'word',0:f'cnt_{label}'})
644 | words_cnt_df_list.append(words_cnt_df)
645 | words_cnt_df = words_cnt_df_list[0]
646 | for i in words_cnt_df_list[1:]:
647 | words_cnt_df = words_cnt_df.merge(i,on = 'word',how = 'outer' )
648 |
649 | words_cnt_df = words_cnt_df.fillna(-1)
650 |     words_cnt_df1 = words_cnt_df.query('cnt_0 > 10 and cnt_1 > 10 and cnt_2 > 10 and cnt_3 > 10 and cnt_5 > 10').copy()  # .copy() avoids SettingWithCopyWarning below
651 | cnt_class = ['cnt_0','cnt_1','cnt_2','cnt_3','cnt_5']
652 | words_cnt_df1['word_cnt_sum'] = words_cnt_df1.loc[:,cnt_class].sum(1)
653 | for i in cnt_class:
654 | words_cnt_df1[f'{i}_ratio'] = words_cnt_df1[i]/words_cnt_df1['word_cnt_sum']
655 | words_cnt_df1['word_cnt_ratio_std'] = words_cnt_df1.loc[:,['cnt_0_ratio','cnt_1_ratio', 'cnt_2_ratio', 'cnt_3_ratio']].std(1)
656 | words_cnt_df1['cnt_1_0_diff'] = (words_cnt_df1['cnt_1_ratio'] - words_cnt_df1['cnt_0_ratio'])
657 | test_key_words = words_cnt_df1.sort_values('cnt_5',ascending = False)['word'].to_list()[5:40]
658 | return test_key_words
659 |
660 | def get_w2v_mean(w2v_model,sentences):
661 | emb_matrix = list()
662 | vec = list()
663 | for w in sentences.split():
664 | if w in w2v_model.wv:
665 | vec.append(w2v_model.wv[w])
666 | if len(vec) > 0:
667 | emb_matrix.append(np.mean(vec, axis=0))
668 | else:
669 | emb_matrix.append([0] * w2v_model.vector_size)
670 | return emb_matrix
671 | def get_tfidf_svd(tfv, svd, sentences):  # dropped the unused n_components arg; output dim is set by the fitted svd
672 | X_tfidf = tfv.transform(sentences)
673 | X_svd = svd.transform(X_tfidf)
674 | return np.mean(X_svd, axis=0)
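
# --- A minimal sketch (toy corpus) of the tfidf+SVD pipeline that get_tfidf_svd
# assumes (TfidfVectorizer/TruncatedSVD are imported at the top of this file):
# fit both once on the corpus, then reduce any token list to the mean of its
# SVD-transformed rows.
def _demo_tfidf_svd():
    corpus = ['memory ierr memory', 'drive fault ierr', 'memory drive fault'] * 3
    tfv_toy = TfidfVectorizer()
    svd_toy = TruncatedSVD(n_components=2, random_state=42)
    svd_toy.fit(tfv_toy.fit_transform(corpus))
    print(get_tfidf_svd(tfv_toy, svd_toy, 'memory ierr'.split()).shape)  # (2,)
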
675 | def get_w2v_tfidf_fea(all_data):
676 |     print('w2v encoding')
677 | df = all_data
678 | df['msg_list'] = df['msg'].apply(lambda x: [i.strip().lower().replace(' ','_') for i in x.split(" | ")])
679 | df = df.groupby(['sn']).agg({'msg_list': 'sum'}).reset_index()
680 | df['text'] = df['msg_list'].apply(lambda x: ' '.join(x))
681 |
682 | sentences_list = df['text'].values.tolist()
683 | sentences = []
684 | for s in sentences_list:
685 | sentences.append([w for w in s.split()])
686 | w2v_model = Word2Vec(sentences, vector_size=10, window=3, min_count=5, sg=0, hs=1, seed=2022)
687 | df['text_w2v'] = df['text'].apply(lambda x: get_w2v_mean(w2v_model, x)[0])
688 |
689 |     print('tfidf encoding')
690 | X = df['text'].to_list()
691 | tfv = TfidfVectorizer(ngram_range=(1, 3), min_df=5, max_features=50000)
692 | tfv.fit(X)
693 | X_tfidf = tfv.transform(X)
694 |     svd = TruncatedSVD(n_components=16)  # dimensionality reduction
695 | svd.fit(X_tfidf)
696 | df['text_tfidf'] = df['text'].apply(lambda x: get_tfidf_svd(tfv, svd, x.split()))
697 |
698 |     print('doc2vec encoding')
699 |     texts = df['text'].tolist()
700 |     documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(texts)]  # split into word tokens; a raw str would be treated as characters
701 |     model = Doc2Vec(documents, window=5, min_count=3, workers=4)
702 |     docvecs = model.dv  # gensim>=4 attribute ('docvecs' is the deprecated alias)
703 |     df['doc2vec'] = [docvecs[i] for i in range(len(docvecs))]
704 |
705 |     for i in range(w2v_model.vector_size):  # vector_size is 10 above; hard-coding range(32) would raise IndexError
706 | df[f'msg_w2v_{i}'] = df['text_w2v'].apply(lambda x: x[i])
707 | for i in range(16):
708 | df[f'msg_tfv_{i}'] = df['text_tfidf'].apply(lambda x: x[i])
709 | for i in range(100):
710 | df[f'msg_doc2vec_{i}'] = df['doc2vec'].apply(lambda x: x[i])
711 |
712 | save_cols = [i for i in df.columns if i not in ['msg_list', 'text', 'text_w2v', 'text_tfidf', 'doc2vec']]
713 | return df[save_cols]
714 |
715 | # w2v_tfidf_fea = get_w2v_tfidf_fea(all_data)
716 | class BetaEncoder(object):
717 |
718 | def __init__(self, group):
719 |
720 | self.group = group
721 | self.stats = None
722 |
723 | # get counts from df
724 | def fit(self, df, target_col):
725 |         # prior mean
726 | self.prior_mean = np.mean(df[target_col])
727 | stats = df[[target_col, self.group]].groupby(self.group)
728 |         # per-group sum and count
729 | stats = stats.agg(['sum', 'count'])[target_col]
730 | stats.rename(columns={'sum': 'n', 'count': 'N'}, inplace=True)
731 | stats.reset_index(level=0, inplace=True)
732 | self.stats = stats
733 |
734 | # extract posterior statistics
735 | def transform(self, df, stat_type, N_min=1):
736 |
737 | df_stats = pd.merge(df[[self.group]], self.stats, how='left')
738 | n = df_stats['n'].copy()
739 | N = df_stats['N'].copy()
740 |
741 | # fill in missing
742 |         nan_mask = np.isnan(n)
743 |         n[nan_mask] = self.prior_mean
744 |         N[nan_mask] = 1.0
745 |
746 | # prior parameters
747 | N_prior = np.maximum(N_min - N, 0)
748 | alpha_prior = self.prior_mean * N_prior
749 | beta_prior = (1 - self.prior_mean) * N_prior
750 |
751 | # posterior parameters
752 | alpha = alpha_prior + n
753 | beta = beta_prior + N - n
754 |
755 | # calculate statistics
756 | if stat_type == 'mean':
757 | num = alpha
758 | dem = alpha + beta
759 |
760 | elif stat_type == 'mode':
761 | num = alpha - 1
762 | dem = alpha + beta - 2
763 |
764 | elif stat_type == 'median':
765 | num = alpha - 1 / 3
766 | dem = alpha + beta - 2 / 3
767 |
768 | elif stat_type == 'var':
769 | num = alpha * beta
770 | dem = (alpha + beta) ** 2 * (alpha + beta + 1)
771 |
772 | elif stat_type == 'skewness':
773 | num = 2 * (beta - alpha) * np.sqrt(alpha + beta + 1)
774 | dem = (alpha + beta + 2) * np.sqrt(alpha * beta)
775 |
776 |         elif stat_type == 'kurtosis':
777 |             # excess kurtosis of Beta(alpha, beta); the factor 6 applies to the whole bracket
778 |             num = 6 * ((alpha - beta) ** 2 * (alpha + beta + 1) -
779 |                        alpha * beta * (alpha + beta + 2))
780 |             dem = alpha * beta * (alpha + beta + 2) * (alpha + beta + 3)
781 | 
782 |         else:
783 |             raise ValueError(f'unknown stat_type: {stat_type}')
784 | 
785 |         # replace missing
786 |         value = num / dem
787 |         value[np.isnan(value)] = np.nanmedian(value)
788 |         return value
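
# --- A minimal sketch (hypothetical data) of BetaEncoder usage: with a large
# N_min the posterior is shrunk toward the global prior mean, so rare
# server_model groups get near-prior encodings instead of noisy group means.
def _demo_beta_encoder():
    toy = pd.DataFrame({'server_model': ['SM1'] * 8 + ['SM2'] * 2,
                        'label': [1, 1, 1, 1, 0, 0, 0, 0, 1, 1]})
    be = BetaEncoder('server_model')
    be.fit(toy, 'label')
    print(be.transform(toy, 'mean', N_min=1000).head())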
785 |
786 |
787 | def get_beta_target(train, test):
788 | N_min = 1000
789 | feature_cols = []
790 |
791 | # encode variables
792 | for c in ['server_model']:
793 | # fit encoder
794 | be = BetaEncoder(c)
795 | be.fit(train, 'label')
796 |
797 | # mean
798 | feature_name = f'{c}_mean'
799 | train[feature_name] = be.transform(train, 'mean', N_min)
800 | test[feature_name] = be.transform(test, 'mean', N_min)
801 | feature_cols.append(feature_name)
802 |
803 | # mode
804 | feature_name = f'{c}_mode'
805 | train[feature_name] = be.transform(train, 'mode', N_min)
806 | test[feature_name] = be.transform(test, 'mode', N_min)
807 | feature_cols.append(feature_name)
808 |
809 | # median
810 | feature_name = f'{c}_median'
811 | train[feature_name] = be.transform(train, 'median', N_min)
812 | test[feature_name] = be.transform(test, 'median', N_min)
813 | feature_cols.append(feature_name)
814 |
815 | # var
816 | feature_name = f'{c}_var'
817 | train[feature_name] = be.transform(train, 'var', N_min)
818 | test[feature_name] = be.transform(test, 'var', N_min)
819 | feature_cols.append(feature_name)
820 |
821 | # # skewness
822 | # feature_name = f'{c}_skewness'
823 | # train[feature_name] = be.transform(train, 'skewness', N_min)
824 | # test[feature_name] = be.transform(test, 'skewness', N_min)
825 | # feature_cols.append(feature_name)
826 |
827 | # kurtosis
828 | feature_name = f'{c}_kurtosis'
829 | train[feature_name] = be.transform(train, 'kurtosis', N_min)
830 | test[feature_name] = be.transform(test, 'kurtosis', N_min)
831 | feature_cols.append(feature_name)
832 |     df = pd.concat([train, test]).reset_index(drop=True)  # DataFrame.append was removed in pandas 2.x
833 | df = df[['sn', 'fault_time', 'server_model', 'server_model_mean',
834 | 'server_model_mode', 'server_model_median', 'server_model_var',
835 | 'server_model_kurtosis']].drop_duplicates().reset_index(drop=True)
836 | return df
837 |
--------------------------------------------------------------------------------