├── .gitignore
├── 3rd_PanJiu_AIOps_Competition
    ├── model
    │   └── model.pkl
    ├── code
    │   ├── requirements.txt
    │   ├── .DS_Store
    │   ├── log.py
    │   ├── stacking.py
    │   ├── generate_pseudo_label.py
    │   ├── model.py
    │   ├── utils.py
    │   ├── lgb_fs.py
    │   ├── catboost_fs.py
    │   ├── get_crashdump_venus_fea.py
    │   └── generate_feature.py
    ├── data
    │   ├── 数据集下载地址
    │   └── .DS_Store
    ├── tcdata
    │   └── 数据集下载地址
    ├── .DS_Store
    ├── feature
    │   └── .DS_Store
    ├── 答辩PPT
    │   ├── .DS_Store
    │   └── 悦智AI实验室_20220525.pdf
    ├── user_data
    │   └── .DS_Store
    ├── docker_push.sh
    ├── run.sh
    ├── run.log
    ├── Dockerfile
    ├── README.md
    ├── log
    │   └── catboost.log
    └── LICENSE
├── .DS_Store
├── README.md
├── .idea
    └── workspace.xml
└── LICENSE


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea
3 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/model/model.pkl:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit_learn==1.0.2
2 | 


--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/.DS_Store


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/data/数据集下载地址:
--------------------------------------------------------------------------------
1 | https://tianchi.aliyun.com/competition/entrance/531947/information?lang=zh-cn


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/tcdata/数据集下载地址:
--------------------------------------------------------------------------------
1 | https://tianchi.aliyun.com/competition/entrance/531947/information?lang=zh-cn


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/.DS_Store


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/code/.DS_Store


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/data/.DS_Store


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/feature/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/feature/.DS_Store


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/答辩PPT/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/答辩PPT/.DS_Store


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/user_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/user_data/.DS_Store


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AI-Competition
2 | 开源往期获奖竞赛代码
3 | 
4 | - [第三届阿里云磐久智维算法大赛亚军方案](./3rd_PanJiu_AIOps_Competition/README.md)
5 |   - INFO：Catboost、伪标签、对抗验证、毫秒级预测（所有竞赛中最优）


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/答辩PPT/悦智AI实验室_20220525.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yz-intelligence/AI-Competition/HEAD/3rd_PanJiu_AIOps_Competition/答辩PPT/悦智AI实验室_20220525.pdf


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/docker_push.sh:
--------------------------------------------------------------------------------
1 | # 创建镜像 并提交到你的镜像仓库
2 | rm -rf result.zip
3 | # built 镜像
4 | docker build -t [你的仓库地址]:[TAG] .
5 | # push 镜像
6 | docker push [你的仓库地址]:[TAG]


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/run.sh:
--------------------------------------------------------------------------------
1 | rm -rf model
2 | #unzip model.zip
3 | python3 code/get_crashdump_venus_fea.py
4 | python3 code/catboost_fs.py
5 | zip -j result.zip prediction_result/catboost_result.csv
6 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/run.log:
--------------------------------------------------------------------------------
 1 | Archive:  model.zip
 2 |    creating: model/deberta-base/
 3 |   inflating: model/debert_model_v21_128_fs_flod_5.h5  
 4 |   inflating: model/debert_model_v21_128_fs_flod_6.h5  
 5 |   inflating: model/debert_model_v21_128_fs_flod_8.h5  
 6 |   inflating: model/README.txt        
 7 |   inflating: model/weight_cs6399_fold_8_v21_128_fs.npy  
 8 |   inflating: model/weight_cs6558_fold_5_v21_128_fs.npy  
 9 |   inflating: model/weight_cs6614_fold_6_v21_128_fs.npy  
10 |   inflating: model/weight_fs6138_fold_8_v21_128_fs.npy  
11 |   inflating: model/weight_fs6280_fold_5_v21_128_fs.npy  
12 |   inflating: model/weight_fs6359_fold_6_v21_128_fs.npy  
13 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Base Images
 2 | ## 从天池基础镜像构建
 3 | FROM registry.cn-shanghai.aliyuncs.com/tcc-public/python:3
 4 | ## 把当前文件夹里的文件构建到镜像的根目录下
 5 | ADD . /
 6 | ## 指定默认工作目录为根目录（需要把run.sh和生成的结果文件都放在该文件夹下，提交后才能运行）
 7 | WORKDIR /
 8 | 
 9 | ## 安装所需要的包
10 | RUN pip config set global.index-url http://mirrors.aliyun.com/pypi/simple/
11 | RUN pip config set install.trusted-host mirrors.aliyun.com
12 | RUN pip3 install -r code/requirements.txt
13 | RUN pip install --upgrade pip
14 | RUN apt -y update
15 | RUN apt install zip
16 | RUN apt install vim -y
17 | RUN apt install screen -y
18 | RUN pip install catboost
19 | RUN pip install scikit-learn
20 | RUN pip install tqdm
21 | RUN pip install lightgbm
22 | RUN pip install gensim==4.1.2
23 | 
24 | ## 镜像启动后统一执行 sh run.sh
25 | CMD ["sh", "run.sh"]
26 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/log.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import logging
 3 | import os
 4 | 
 5 | 
 6 | class Logger:
 7 |     def __init__(self, name, log_path, mode='a'):
 8 |         """
 9 |         程序运行日志类的构造函数
10 |         :param name: 需要保存的日志文件名称，默认后缀名称为 .log
11 |         :param log_path: 需要保存的日志文件路径
12 |         :param mode: 日志写入模式， a:追加， w:覆盖
13 |         使用说明：
14 |             1、创建日志实例对象
15 |                 logger = Logger("textCNN_train", log_path="../logs").get_log
16 |             2、将关键信息通过日志实例对象写入日志文件
17 |                 logger.info("")
18 |         """
19 |         self.__name = name
20 |         self.logger = logging.getLogger(self.__name)
21 |         self.logger.setLevel(logging.DEBUG)
22 |         self.log_path = log_path
23 |         self.mode = mode
24 | 
25 |         # 创建一个handler，用于写入日志文件
26 |         # log_path = os.path.dirname(os.path.abspath(__file__))
27 |         # 指定utf-8格式编码，避免输出的日志文本乱码
28 |         logname = os.path.join(self.log_path, self.__name + '.log')  # 指定输出的日志文件名
29 |         # 定义handler的输出格式
30 |         formatter = logging.Formatter(
31 |             '%(asctime)s-%(filename)s-[日志信息]-[%(module)s-%(funcName)s-line:%(lineno)d]-%(levelname)s: %(message)s')
32 | 
33 |         fh = logging.FileHandler(logname, mode=self.mode, encoding='utf-8')  # 不拆分日志文件，a指追加模式,w为覆盖模式
34 |         fh.setLevel(logging.DEBUG)
35 | 
36 |         # 创建一个handler，用于将日志输出到控制台
37 |         ch = logging.StreamHandler()
38 |         ch.setLevel(logging.DEBUG)
39 | 
40 |         fh.setFormatter(formatter)
41 |         ch.setFormatter(formatter)
42 | 
43 |         # 给logger添加handler
44 |         self.logger.addHandler(fh)
45 |         self.logger.addHandler(ch)
46 | 
47 |     @property
48 |     def get_log(self):
49 |         """定义一个函数，回调logger实例"""
50 |         return self.logger
51 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/stacking.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import pandas as pd
 4 | from  utils import RESULT_DIR
 5 | lgb_result = pd.read_csv(os.path.join(RESULT_DIR,'lgb_prob_result.csv'))
 6 | lgb_result = lgb_result[lgb_result['label'].isnull()]
 7 | print(lgb_result.columns)
 8 | del lgb_result['label']
 9 | 
10 | cat_result = pd.read_csv(os.path.join(RESULT_DIR,'cat_prob_result.csv'))
11 | cat_result = cat_result[cat_result['label'].isnull()]
12 | del cat_result['label']
13 | 
14 | # bert_result = pd.read_csv(os.path.join(RESULT_DIR,'bert_prob_result.csv'))
15 | 
16 | model_weight = {'lgb':0.2,'cat':0.8,'bert':0.2}
17 | print(f'MODEL WEIGHT: {model_weight}')
18 | # for i in ['bert_class_0', 'bert_class_1', 'bert_class_2','bert_class_3']:
19 | #     bert_result[i] = bert_result[i]*model_weight['bert']
20 | 
21 | for i in  ['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3']:
22 |     cat_result[i] = cat_result[i]*model_weight['cat']
23 | 
24 | for i in  ['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3']:
25 |     lgb_result[i] = lgb_result[i]*model_weight['lgb']
26 | 
27 | result= lgb_result.merge(cat_result,on =['sn', 'fault_time'],how ='left' )
28 | 
29 | # result= bert_result.merge(cat_result,on =['sn', 'fault_time'],how ='left' )
30 | #
31 | # result['class_0'] =result.loc[:,['cat_class_0','bert_class_0']].sum(1)
32 | # result['class_1'] =result.loc[:,['cat_class_1','bert_class_0']].sum(1)
33 | # result['class_2'] =result.loc[:,['cat_class_2','bert_class_0']].sum(1)
34 | # result['class_3'] =result.loc[:,['cat_class_3','bert_class_0']].sum(1)
35 | 
36 | result['class_0'] =result.loc[:,['lgb_class_0','cat_class_0']].sum(1)
37 | result['class_1'] =result.loc[:,['lgb_class_1','cat_class_1']].sum(1)
38 | result['class_2'] =result.loc[:,['lgb_class_2','cat_class_2']].sum(1)
39 | result['class_3'] =result.loc[:,['lgb_class_3','cat_class_3']].sum(1)
40 | 
41 | result['label'] = np.argmax(result.loc[:,['class_0', 'class_1', 'class_2', 'class_3']].values,axis = 1)
42 | result = result[['sn', 'fault_time','label']]
43 | result.to_csv(os.path.join(RESULT_DIR,'stacking_result.csv'),index = False)
44 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/README.md:
--------------------------------------------------------------------------------
 1 | # 第三届阿里云磐久智维算法大赛亚军方案
 2 | 
 3 | ## 官网地址
 4 | 
 5 | https://tianchi.aliyun.com/competition/entrance/531947/introduction
 6 | 
 7 | ## 项目目录结构
 8 | ```
 9 | ├── Dockerfile 
10 | ├── README.md
11 | ├── code
12 | │   ├── catboost_fs.py +++++++++++++++++++++++++++++++ 模型训练代码
13 | │   ├── generate_feature.py ++++++++++++++++++++++++++ 特征生成代码
14 | │   ├── generate_pseudo_label.py  ++++++++++++++++++++ 伪标签代码
15 | │   ├── get_crashdump_venus_fea.py +++++++++++++++++++ 新数据特征生成代码
16 | │   ├── requirements.txt +++++++++++++++++++++++++++++ python包版本
17 | │   ├── stacking.py ++++++++++++++++++++++++++++++++++ 模型融合代码
18 | │   └── utils.py +++++++++++++++++++++++++++++++++++++ 小工具脚本
19 | ├── data
20 | │   ├── preliminary_a_test +++++++++++++++++++++++++++ 初赛A榜测试数据集
21 | │   ├── preliminary_b_test +++++++++++++++++++++++++++ 初赛B榜测试数据集
22 | │   └── preliminary_train ++++++++++++++++++++++++++++ 训练集数据
23 | ├── docker_push.sh +++++++++++++++++++++++++++++++++++++++++ Docker镜像构建、push脚本
24 | ├── feature
25 | │   └── generation +++++++++++++++++++++++++++++++++++ 特征生成文件夹
26 | ├── log ++++++++++++++++++++++++++++++++++++++++++++++++++++ 日志文件夹
27 | │   ├── catboost.log +++++++++++++++++++++++++++++++++ 模型运行日志
28 | ├── model ++++++++++++++++++++++++++++++++++++++++++++++++++ 模型文件
29 | ├── prediction_result ++++++++++++++++++++++++++++++++++++++ 模型预测结果文件夹
30 | │   ├── cat_prob_result.csv ++++++++++++++++++++++++++ CATBOOST模型预测概率
31 | │   ├── catboost_result.csv ++++++++++++++++++++++++++ CATBOOST模型预测结果
32 | │   └── stacking_result.csv ++++++++++++++++++++++++++ 模型融合结果
33 | ├── run.log ++++++++++++++++++++++++++++++++++++++++++++++++ 代码运行日志
34 | ├── run.sh  ++++++++++++++++++++++++++++++++++++++++++++++++ 代码运行脚本
35 | ├── tcdata  ++++++++++++++++++++++++++++++++++++++++++++++++ 复赛测试集数据文件夹(具体文件请使用初赛相关文件更改文件名替换)
36 | │   ├── final_crashdump_dataset_b.csv ++++++++++++++++ 复赛B榜新数据文件
37 | │   ├── final_sel_log_dataset_b.csv ++++++++++++++++++ 复赛测试集日志文件
38 | │   ├── final_submit_dataset_b.csv +++++++++++++++++++ 复赛测试集ID
39 | │   └── final_venus_dataset_b.csv ++++++++++++++++++++ 复赛B榜新数据文件
40 | ├── user_data
41 | │   └── tmp_data +++++++++++++++++++++++++++++++++++++ 临时文件
42 | └── 答辩PPT
43 |     └── 悦智AI实验室_20220525.pdf
44 | ```
45 | ## 运行环境
46 | Python版本为3.8，各个Python包版本见requirements.txt，使用如下命令即可安装：
47 | ```
48 | pip install -r code/requirements.txt
49 | ```
50 | 
51 | ## 构建镜像运行代码
52 | ### 构建镜像
53 | ```
54 | docker build -t [你的镜像仓库]:[TAG] .
55 | ```
56 | ### 运行镜像
57 | ```
58 | docker run  [你的镜像ID] sh run.sh 
59 | ```
60 | ### push 镜像
61 | ```
62 | docker push [你的仓库地址]:[TAG]
63 | ```
64 | ### 运行&push 镜像
65 | ```
66 | bash docker_push.sh
67 | ```
68 | 
69 | ## 运行代码
70 | ```
71 | bash run.sh
72 | ```


--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="ChangeListManager">
 4 |     <list default="true" id="202c809e-76bc-467a-bac3-ba4677182041" name="Changes" comment="添加第三届阿里云磐久智维算法大赛亚军方案" />
 5 |     <option name="SHOW_DIALOG" value="false" />
 6 |     <option name="HIGHLIGHT_CONFLICTS" value="true" />
 7 |     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
 8 |     <option name="LAST_RESOLUTION" value="IGNORE" />
 9 |   </component>
10 |   <component name="Git.Settings">
11 |     <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
12 |   </component>
13 |   <component name="ProjectId" id="29vKMAdvIvp9Imcwv3vNlmEjnH7" />
14 |   <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
15 |   <component name="ProjectViewState">
16 |     <option name="hideEmptyMiddlePackages" value="true" />
17 |     <option name="showLibraryContents" value="true" />
18 |     <option name="showMembers" value="true" />
19 |   </component>
20 |   <component name="PropertiesComponent">
21 |     <property name="RunOnceActivity.OpenProjectViewOnStart" value="true" />
22 |     <property name="RunOnceActivity.ShowReadmeOnStart" value="true" />
23 |     <property name="WebServerToolWindowFactoryState" value="false" />
24 |     <property name="last_opened_file_path" value="$PROJECT_DIR$" />
25 |   </component>
26 |   <component name="RecentsManager">
27 |     <key name="CopyFile.RECENT_KEYS">
28 |       <recent name="$PROJECT_DIR$" />
29 |     </key>
30 |   </component>
31 |   <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
32 |   <component name="TaskManager">
33 |     <task active="true" id="Default" summary="Default task">
34 |       <changelist id="202c809e-76bc-467a-bac3-ba4677182041" name="Changes" comment="" />
35 |       <created>1653988069044</created>
36 |       <option name="number" value="Default" />
37 |       <option name="presentableId" value="Default" />
38 |       <updated>1653988069044</updated>
39 |       <workItem from="1653988070376" duration="1680000" />
40 |       <workItem from="1654767330536" duration="24000" />
41 |     </task>
42 |     <task id="LOCAL-00001" summary="添加第三届阿里云磐久智维算法大赛亚军方案">
43 |       <created>1653988253675</created>
44 |       <option name="number" value="00001" />
45 |       <option name="presentableId" value="LOCAL-00001" />
46 |       <option name="project" value="LOCAL" />
47 |       <updated>1653988253675</updated>
48 |     </task>
49 |     <task id="LOCAL-00002" summary="添加第三届阿里云磐久智维算法大赛亚军方案">
50 |       <created>1653988258918</created>
51 |       <option name="number" value="00002" />
52 |       <option name="presentableId" value="LOCAL-00002" />
53 |       <option name="project" value="LOCAL" />
54 |       <updated>1653988258918</updated>
55 |     </task>
56 |     <option name="localTasksCounter" value="3" />
57 |     <servers />
58 |   </component>
59 |   <component name="TypeScriptGeneratedFilesManager">
60 |     <option name="version" value="3" />
61 |   </component>
62 |   <component name="Vcs.Log.Tabs.Properties">
63 |     <option name="TAB_STATES">
64 |       <map>
65 |         <entry key="MAIN">
66 |           <value>
67 |             <State />
68 |           </value>
69 |         </entry>
70 |       </map>
71 |     </option>
72 |     <option name="oldMeFiltersMigrated" value="true" />
73 |   </component>
74 |   <component name="VcsManagerConfiguration">
75 |     <MESSAGE value="添加第三届阿里云磐久智维算法大赛亚军方案" />
76 |     <option name="LAST_COMMIT_MESSAGE" value="添加第三届阿里云磐久智维算法大赛亚军方案" />
77 |   </component>
78 | </project>


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/generate_pseudo_label.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import os
 4 | from utils import TRAIN_DIR ,TEST_A_DIR,TEST_B_DIR,RESULT_DIR,DATA_DIR
 5 | 
 6 | log_dataset_a = pd.read_csv(os.path.join(DATA_DIR,'preliminary_a_test/preliminary_sel_log_dataset_a.csv'))
 7 | log_dataset_b = pd.read_csv(os.path.join(DATA_DIR,'preliminary_b_test/preliminary_sel_log_dataset_b.csv'))
 8 | submit_dataset_a = pd.read_csv(os.path.join(DATA_DIR,'preliminary_a_test/preliminary_submit_dataset_a.csv'))
 9 | submit_dataset_b = pd.read_csv(os.path.join(DATA_DIR,'preliminary_b_test/preliminary_submit_dataset_b.csv'))
10 | 
11 | log_dataset_c = pd.concat([log_dataset_a,log_dataset_b],ignore_index = True,axis = 0)
12 | submit_dataset_c = pd.concat([submit_dataset_a,submit_dataset_b],ignore_index = True,axis = 0)
13 | 
14 | log_dataset_c.to_csv(os.path.join(TEST_A_DIR,'final_log_dataset_c.csv'),index =False)
15 | submit_dataset_c.to_csv(os.path.join(TEST_A_DIR,'final_submit_dataset_c.csv'),index =False)
16 | 
17 | 
18 | #
19 | # cat_prob = pd.read_csv(os.path.join(RESULT_DIR,'../../../TianchiAIOps_bert_model/cat_prob_result.csv'))
20 | # lgb_prob = pd.read_csv(os.path.join(RESULT_DIR,'../../../TianchiAIOps_bert_model/lgb_prob_result.csv'))
21 | 
22 | cat_prob = pd.read_csv(os.path.join(RESULT_DIR,'B_prob_7511.csv'))
23 | lgb_prob = pd.read_csv(os.path.join(RESULT_DIR,'baseline_prob_7495.csv'))
24 | cat_prob.columns  = ['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3', 'label', 'sn',
25 |        'fault_time']
26 | lgb_prob.columns = ['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3', 'label', 'sn',
27 |        'fault_time']
28 | 
29 | lgb_prob = lgb_prob[lgb_prob['label'].isnull()]
30 | cat_prob = cat_prob[cat_prob['label'].isnull()]
31 | 
32 | cat_prob['cat_prob'] = cat_prob.loc[:,['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3']].max(1)
33 | cat_prob['cat_label'] = np.argmax(cat_prob.loc[:,['cat_class_0', 'cat_class_1', 'cat_class_2', 'cat_class_3']].values,axis = 1)
34 | 
35 | lgb_prob['lgb_prob'] = lgb_prob.loc[:,['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3']].max(1)
36 | lgb_prob['lgb_label'] = np.argmax(lgb_prob.loc[:,['lgb_class_0', 'lgb_class_1', 'lgb_class_2', 'lgb_class_3']].values,axis = 1)
37 | 
38 | lgb_prob = lgb_prob[['sn','fault_time','lgb_label','lgb_prob']]
39 | cat_prob = cat_prob[['sn','fault_time','cat_label','cat_prob']]
40 | 
41 | # prob = cat_prob.merge(lgb_prob,on =['sn','fault_time'],
42 | #                how = 'left' )
43 | 
44 | prob = pd.concat([cat_prob,lgb_prob],ignore_index = True)
45 | prob['cat_prob']=prob['cat_prob'].fillna(1)
46 | prob['lgb_prob']=prob['lgb_prob'].fillna(1)
47 | prob.loc[prob['cat_label'].isnull(),'cat_label'] = prob.loc[prob['cat_label'].isnull(),'lgb_label']
48 | prob.loc[prob['lgb_label'].isnull(),'lgb_label'] = prob.loc[prob['lgb_label'].isnull(),'cat_label']
49 | 
50 | 
51 | pseudo_labels = prob.query('cat_prob >0.85 and lgb_prob >0.85 and lgb_label == cat_label  ')
52 | 
53 | pseudo_labels = pseudo_labels[['sn','fault_time','cat_label']].rename(columns = {'cat_label':'label'}).reset_index(drop = True)
54 | pseudo_labels.to_csv(os.path.join(TRAIN_DIR,'pseudo_labels.csv'),index= False)
55 | print(f'生成伪标签的数据维度:{pseudo_labels.shape}')
56 | 
57 | pseudo_sel_log_dataset = pd.read_csv(os.path.join(TEST_A_DIR,'final_sel_log_dataset_c.csv'))
58 | pseudo_sel_log_dataset = pseudo_sel_log_dataset[pseudo_sel_log_dataset['sn'].isin(pseudo_labels['sn'].to_list())]
59 | pseudo_sel_log_dataset.to_csv(os.path.join(TRAIN_DIR,'pseudo_sel_log_dataset.csv'),index = False)
60 | print(f'生成伪标签的日志数据维度:{pseudo_sel_log_dataset.shape}')
61 | 
62 | # 制作新的测试集
63 | final_submit_dataset_d= prob.merge(pseudo_labels,on =['sn','fault_time'],how = 'left' )
64 | final_submit_dataset_d = final_submit_dataset_d[final_submit_dataset_d['label'].isnull()][['sn','fault_time' ]].reset_index(drop = True)
65 | final_submit_dataset_d.to_csv(os.path.join(TEST_A_DIR,'final_submit_dataset_d.csv'),index= False)
66 | print(f'生成新的测试集维度:{final_submit_dataset_d.shape}')
67 | 
68 | final_sel_log_dataset_d = pd.read_csv(os.path.join(TEST_A_DIR,'final_sel_log_dataset_c.csv'))
69 | final_sel_log_dataset_d = final_sel_log_dataset_d[final_sel_log_dataset_d['sn'].isin(final_submit_dataset_d['sn'].to_list())]
70 | 
71 | final_sel_log_dataset_d.to_csv(
72 |     os.path.join(TEST_A_DIR,'final_sel_log_dataset_d.csv'),index = False)
73 | print(f'生成新的测试集日志数据维度:{final_sel_log_dataset_d.shape}')


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/log/catboost.log:
--------------------------------------------------------------------------------
  1 | use_less_col:335
  2 | 使用的特征维度: 1762
  3 | ********************** RUN CATBOOST MODEL **********************
  4 | ******************  当前的 SEED 42 ********************** 
  5 | FOLD 1 IS RUNNING...
  6 | 0:	learn: 1.2939006	test: 1.2943096	best: 1.2943096 (0)	total: 135ms	remaining: 22m 27s
  7 | 800:	learn: 0.2057676	test: 0.2778188	best: 0.2778188 (800)	total: 1m 2s	remaining: 11m 55s
  8 | 1600:	learn: 0.1533318	test: 0.2698555	best: 0.2698522 (1599)	total: 2m 4s	remaining: 10m 54s
  9 | Stopped by overfitting detector  (100 iterations wait)
 10 | 
 11 | bestTest = 0.2677497192
 12 | bestIteration = 2222
 13 | 
 14 | Shrink model to first 2223 iterations.
 15 | {'learn': {'MultiClass': 0.12163532058790176}, 'validation': {'MultiClass': 0.26774971916097773}}
 16 | FOLD 2 IS RUNNING...
 17 | 0:	learn: 1.2947765	test: 1.2944610	best: 1.2944610 (0)	total: 81.8ms	remaining: 13m 38s
 18 | 800:	learn: 0.2009925	test: 0.2969940	best: 0.2969940 (800)	total: 1m 2s	remaining: 11m 53s
 19 | Stopped by overfitting detector  (100 iterations wait)
 20 | 
 21 | bestTest = 0.2898436422
 22 | bestIteration = 1413
 23 | 
 24 | Shrink model to first 1414 iterations.
 25 | {'learn': {'MultiClass': 0.15671545706553627}, 'validation': {'MultiClass': 0.2898436422052235}}
 26 | FOLD 3 IS RUNNING...
 27 | 0:	learn: 1.2956904	test: 1.2979653	best: 1.2979653 (0)	total: 83.6ms	remaining: 13m 55s
 28 | 800:	learn: 0.2010365	test: 0.3031897	best: 0.3031249 (796)	total: 1m 2s	remaining: 11m 56s
 29 | 1600:	learn: 0.1521093	test: 0.2952955	best: 0.2952927 (1598)	total: 2m 4s	remaining: 10m 54s
 30 | Stopped by overfitting detector  (100 iterations wait)
 31 | 
 32 | bestTest = 0.2948664255
 33 | bestIteration = 1799
 34 | 
 35 | Shrink model to first 1800 iterations.
 36 | {'learn': {'MultiClass': 0.13764700334845772}, 'validation': {'MultiClass': 0.2948664254808659}}
 37 | FOLD 4 IS RUNNING...
 38 | 0:	learn: 1.2944941	test: 1.2931731	best: 1.2931731 (0)	total: 83.8ms	remaining: 13m 58s
 39 | 800:	learn: 0.2055831	test: 0.2798750	best: 0.2798750 (800)	total: 1m 2s	remaining: 11m 54s
 40 | 1600:	learn: 0.1555797	test: 0.2733073	best: 0.2732265 (1590)	total: 2m 4s	remaining: 10m 54s
 41 | Stopped by overfitting detector  (100 iterations wait)
 42 | 
 43 | bestTest = 0.2729804824
 44 | bestIteration = 1672
 45 | 
 46 | Shrink model to first 1673 iterations.
 47 | {'learn': {'MultiClass': 0.14819996336927216}, 'validation': {'MultiClass': 0.27298048242230794}}
 48 | FOLD 5 IS RUNNING...
 49 | 0:	learn: 1.2909100	test: 1.2914652	best: 1.2914652 (0)	total: 86.9ms	remaining: 14m 29s
 50 | 800:	learn: 0.2014462	test: 0.2983963	best: 0.2983963 (800)	total: 1m 2s	remaining: 11m 55s
 51 | 1600:	learn: 0.1523926	test: 0.2909189	best: 0.2907775 (1582)	total: 2m 4s	remaining: 10m 54s
 52 | Stopped by overfitting detector  (100 iterations wait)
 53 | 
 54 | bestTest = 0.2898741689
 55 | bestIteration = 1887
 56 | 
 57 | Shrink model to first 1888 iterations.
 58 | {'learn': {'MultiClass': 0.13391467495348316}, 'validation': {'MultiClass': 0.289874168865446}}
 59 | 
 60 | OOF-MEAN-ERROR score:0.283063, OOF-STD:0.010657
 61 | Inint Score: 0.7240031522090993
 62 | round:  1
 63 | class:0, new_weight:1.01, f1 score: 0.7242893038330873
 64 | class:0, new_weight:1.02, f1 score: 0.7244468658037289
 65 | class:0, new_weight:1.03, f1 score: 0.7247189260435818
 66 | class:0, new_weight:1.05, f1 score: 0.7247883133652404
 67 | class:0, new_weight:1.06, f1 score: 0.7253074441711662
 68 | class:0, new_weight:1.07, f1 score: 0.7255838308898628
 69 | class:0, new_weight:1.09, f1 score: 0.7258591461588992
 70 | class:0, new_weight:1.1, f1 score: 0.7263732069942956
 71 | class:0, new_weight:1.11, f1 score: 0.7269810148203093
 72 | class:0, new_weight:1.12, f1 score: 0.727085092104794
 73 | class:0, new_weight:1.19, f1 score: 0.7275673332111965
 74 | class:0, new_weight:1.2, f1 score: 0.7277300984468054
 75 | class:0, new_weight:1.21, f1 score: 0.7300337938032027
 76 | class:0, new_weight:1.22, f1 score: 0.7302916982856817
 77 | class:0, new_weight:1.32, f1 score: 0.7302972834627351
 78 | class:0, new_weight:1.33, f1 score: 0.7305212560605624
 79 | class:0, new_weight:1.34, f1 score: 0.7307742905548762
 80 | class:0, new_weight:1.3800000000000001, f1 score: 0.731115696618317
 81 | class:0, new_weight:1.3900000000000001, f1 score: 0.7311341774607671
 82 | class:0, new_weight:1.4000000000000001, f1 score: 0.7321211157346706
 83 | class:0, new_weight:1.41, f1 score: 0.732530278451288
 84 | class:0, new_weight:1.42, f1 score: 0.7326514907204666
 85 | class:0, new_weight:1.43, f1 score: 0.7326655042252155
 86 | class:0, new_weight:1.44, f1 score: 0.7340465325949609
 87 | class:0, new_weight:1.45, f1 score: 0.7349701799847135
 88 | class:2, new_weight:0.47000000000000003, f1 score: 0.7351355277520346
 89 | class:2, new_weight:0.51, f1 score: 0.7352366908078052
 90 | class:2, new_weight:0.52, f1 score: 0.7354704485017871
 91 | class:2, new_weight:0.53, f1 score: 0.7356003615547112
 92 | class:2, new_weight:0.54, f1 score: 0.7358162977063339
 93 | class:2, new_weight:0.55, f1 score: 0.7360528528073605
 94 | class:2, new_weight:0.6, f1 score: 0.7360930396635706
 95 | class:2, new_weight:0.62, f1 score: 0.7361315695490319
 96 | class:3, new_weight:0.77, f1 score: 0.736236509770795
 97 | class:3, new_weight:0.79, f1 score: 0.7362861930960579
 98 | class:3, new_weight:0.8, f1 score: 0.73637330491084
 99 | class:3, new_weight:0.81, f1 score: 0.7364039172775363
100 | class:3, new_weight:0.8200000000000001, f1 score: 0.7365143106346561
101 | class:3, new_weight:0.8300000000000001, f1 score: 0.7366247783303799
102 | round:  2
103 | class:2, new_weight:0.55, f1 score: 0.7366811046538598
104 | round:  3
105 | ********************** SEARCH BEST WEIGHT : [1.45, 1.0, 0.55, 0.8300000000000001] **********************
106 | ********************** BEST MACRO_F1 : 0.7366811046538598 **********************


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/model.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | import datetime
  3 | import lightgbm as lgb
  4 | import numpy as np
  5 | import pandas as pd
  6 | from catboost import CatBoostClassifier
  7 | from sklearn.model_selection import StratifiedKFold
  8 | 
  9 | from utils import N_ROUNDS
 10 | import pickle
 11 | import os
 12 | warnings.filterwarnings('ignore')
 13 | 
 14 | 
 15 | def get_model_feature_importances(model):
 16 |     feature_importances = pd.DataFrame()
 17 |     feature_importances['fea'] = model.feature_names_
 18 |     feature_importances['importances'] = model.feature_importances_
 19 |     feature_importances = feature_importances.sort_values('importances', ascending=False).reset_index(drop=True)
 20 | 
 21 |     return feature_importances
 22 | 
 23 | 
 24 | def run_cbt(train, target, test, k, seed, NUM_CLASS=4, cat_cols=[]):
 25 |     print('********************** RUN CATBOOST MODEL **********************')
 26 |     print(f'******************  当前的 SEED {seed} ********************** ')
 27 |     folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
 28 |     oof_prob = np.zeros((train.shape[0], NUM_CLASS))
 29 |     test_prob = np.zeros((test.shape[0], NUM_CLASS))
 30 |     feature_importance_df = []
 31 |     offline_score = []
 32 |     model_list = []
 33 | 
 34 |     ## K-Fold
 35 |     for fold, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
 36 |         print("FOLD {} IS RUNNING...".format(fold + 1))
 37 |         trn_x, trn_y = train.loc[trn_idx], target.loc[trn_idx]
 38 |         val_x, val_y = train.loc[val_idx], target.loc[val_idx]
 39 |         catboost_model = CatBoostClassifier(
 40 |             iterations=N_ROUNDS,
 41 |             od_type='Iter',
 42 |             od_wait=120,
 43 |             max_depth=8,
 44 |             learning_rate=0.05,
 45 |             l2_leaf_reg=9,
 46 |             random_seed=seed,
 47 |             fold_len_multiplier=1.1,
 48 |             loss_function='MultiClass',
 49 |             logging_level='Verbose',
 50 |             # task_type="GPU"
 51 | 
 52 |         )
 53 | 
 54 |         start_time = datetime.datetime.now()
 55 | 
 56 |         catboost_model.fit(trn_x,
 57 |                            trn_y,
 58 |                            eval_set=(val_x, val_y),
 59 |                            use_best_model=True,
 60 |                            verbose=800,
 61 |                            early_stopping_rounds=100,
 62 |                            cat_features=cat_cols,
 63 |                            )
 64 |         end_time = datetime.datetime.now()
 65 |         model_train_cost_time = end_time - start_time
 66 |         print('****************** 模型训练 COST TIME : ',str(model_train_cost_time),' ******************')
 67 | 
 68 |         start_time = datetime.datetime.now()
 69 |         oof_prob[val_idx] = catboost_model.predict_proba(train.loc[val_idx])
 70 |         end_time = datetime.datetime.now()
 71 |         model_pred_cost_time = end_time - start_time
 72 |         print('****************** 模型预测 COST TIME : ', str(model_pred_cost_time), ' ******************')
 73 |         #         catboost_model = catboost_model.get_best_iteration()
 74 |         test_prob += catboost_model.predict_proba(test) / folds.n_splits
 75 |         print(catboost_model.get_best_score())
 76 |         offline_score.append(catboost_model.get_best_score()['validation']['MultiClass'])
 77 | 
 78 |         feature_importance_df.append(get_model_feature_importances(catboost_model))
 79 |         model_list.append(catboost_model)
 80 |         with open(os.path.join('../model', f'cat_model_flod_{fold}.pkl'), 'wb') as f:
 81 |             pickle.dump(catboost_model, f)
 82 |     print('\nOOF-MEAN-ERROR score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
 83 |     fea_imp_df = pd.concat(feature_importance_df, ignore_index=True).groupby('fea').agg(
 84 |         {'importances': 'mean'}).reset_index().sort_values('importances', ascending=False).reset_index(drop=True)
 85 | 
 86 |     return oof_prob, test_prob, fea_imp_df, model_list
 87 | 
 88 | 
 89 | def run_lgb(train, target, test, k, seed=42, NUM_CLASS=4, cat_cols=[]):
 90 |     # feats = [f for f in train.columns if f not in ['cust_no', 'label', 'I7', 'I9', 'B6']]
 91 |     #     print('Current num of features:', len(feats))
 92 |     print(f'********************** RUN LGBM MODEL **********************')
 93 |     print(f'******************  当前的 SEED {seed} ********************** ')
 94 |     cols_map = {j: i for i, j in enumerate(train.columns)}
 95 |     cat_cols = [cols_map[i] for i in cat_cols]
 96 |     train = train.rename(columns=cols_map)
 97 |     test = test.rename(columns=cols_map)
 98 |     folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
 99 |     oof_prob = np.zeros((train.shape[0], NUM_CLASS))
100 |     test_prob = np.zeros((test.shape[0], NUM_CLASS))
101 |     fea_imp_df_list = []
102 |     offline_score = []
103 |     model_list = []
104 |     ## K-Fold
105 |     for fold, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
106 |         params = {
107 |             "objective": "multiclass",
108 |             "num_class": NUM_CLASS,
109 |             "learning_rate": 0.01,
110 |             "max_depth": -1,
111 |             "num_leaves": 32,
112 |             "verbose": -1,
113 |             "bagging_fraction": 0.8,
114 |             "feature_fraction": 0.8,
115 |             "seed": seed,
116 |             'metric': 'multi_error'
117 | 
118 |         }
119 |         print("FOLD {} IS RUNNING...".format(fold + 1))
120 |         trn_data = lgb.Dataset(train.loc[trn_idx], label=target.loc[trn_idx])
121 |         val_data = lgb.Dataset(train.loc[val_idx], label=target.loc[val_idx])
122 | 
123 |         # train
124 |         params['seed'] = seed
125 |         lgb_model = lgb.train(
126 |             params,
127 |             trn_data,
128 |             num_boost_round=N_ROUNDS,
129 |             valid_sets=[trn_data, val_data],
130 |             early_stopping_rounds=100,
131 |             verbose_eval=200,
132 |             categorical_feature=cat_cols,
133 | 
134 |         )
135 |         # predict
136 |         oof_prob[val_idx] = lgb_model.predict(train.loc[val_idx], num_iteration=lgb_model.best_iteration)
137 |         test_prob += lgb_model.predict(test, num_iteration=lgb_model.best_iteration) / folds.n_splits
138 |         offline_score.append(lgb_model.best_score['valid_1']['multi_error'])
139 |         fea_imp = pd.DataFrame()
140 |         fea_imp['feature_name'] = lgb_model.feature_name()
141 |         fea_imp['importance'] = lgb_model.feature_importance()
142 |         fea_imp['feature_name'] = fea_imp['feature_name'].map({str(cols_map[i]): i for i in cols_map})
143 |         fea_imp = fea_imp.sort_values('importance', ascending=False)
144 |         fea_imp_df_list.append(fea_imp)
145 | 
146 |         model_list.append(lgb_model)
147 |     print('\nOOF-MEAN-ERROR score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
148 |     fea_imp_df = pd.concat(fea_imp_df_list, ignore_index=True).groupby('feature_name').agg(
149 |         {'importance': 'mean'}).reset_index().sort_values('importance', ascending=False)
150 |     return oof_prob, test_prob, fea_imp_df, model_list
151 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | from log import Logger
  4 | from collections import Counter
  5 | from tqdm import tqdm
  6 | import numpy as np
  7 | import pandas as pd
  8 | 
  9 | ROOT_DIR = os.path.join(sys.path[0], '../')
 10 | LOG_DIR = os.path.join(ROOT_DIR, 'log')
 11 | 
 12 | DATA_DIR = os.path.join(ROOT_DIR, 'data')
 13 | TRAIN_DIR = os.path.join(DATA_DIR, 'preliminary_train')
 14 | # 提交docker时 需要打开更换
 15 | MODEL_PATH = os.path.join(ROOT_DIR, './model/deberta-base')
 16 | MODEL_1_PATH = os.path.join(ROOT_DIR, './model')
 17 | TEST_A_DIR = os.path.join(ROOT_DIR, './tcdata')
 18 | # TEST_A_DIR = os.path.join(ROOT_DIR, './tcdata_test')
 19 | PSEUDO_FALG = True
 20 | TEST_B_DIR = os.path.join(ROOT_DIR, 'tcdata')
 21 | 
 22 | 
 23 | 
 24 | RESULT_DIR = os.path.join(ROOT_DIR, 'prediction_result')
 25 | 
 26 | FEATURE_DIR = os.path.join(ROOT_DIR, 'feature')
 27 | GENERATION_DIR = os.path.join(FEATURE_DIR, 'generation')
 28 | CORRELATION_DIR = os.path.join(FEATURE_DIR, 'correlation')
 29 | 
 30 | 
 31 | USER_DATA_DIR = os.path.join(ROOT_DIR, 'user_data')
 32 | USER_MODEL_DIR = os.path.join(USER_DATA_DIR, 'model_data')
 33 | TMP_DIR = os.path.join(USER_DATA_DIR, 'tmp_data')
 34 | N_ROUNDS = 10000
 35 | TIME_INTERVAL = 60
 36 | 
 37 | KEY_1 = ['OEM record c2', 'Processor CPU_Core_Error', '001c4c', 'System Event Sys_Event', 'Power Supply PS0_Status',
 38 |          'Temperature CPU0_Margin_Temp', 'Reading 51 &gt; Threshold 85 degrees C', 'Lower Non-critical going low',
 39 |          'Temperature CPU1_Margin_Temp', 'System ACPI Power State #0x7d', 'Lower Critical going low']
 40 | KEY_2 = ['OEM CPU0 MCERR', 'OEM CPU0 CATERR', 'Reading 0 &lt; Threshold 2 degrees C', '0203c0a80101',
 41 |          'Unknown CPU0 MCERR', 'Unknown CPU0 CATERR', 'Microcontroller #0x3b', 'System Boot Initiated',
 42 |          'Processor #0xfa', 'Power Unit Pwr Unit Status', 'Hard reset', 'Power off/down', 'System Event #0xff',
 43 |          'Memory CPU1A1_DIMM_Stat', '000000', 'Power cycle', 'OEM record c3', 'Memory CPU1C0_DIMM_Stat',
 44 |          'Reading 0 &lt; Threshold 1 degrees C', 'IERR']
 45 | KEY_3 = ['Memory', 'Correctable ECC logging limit reached', 'Memory MEM_CHE0_Status', 'Memory Memory_Status',
 46 |          'Memory #0x87', 'Memory CPU0F0_DIMM_Stat', 'Memory Device Disabled', 'Memory #0xe2',
 47 |          'OS Stop/Shutdown OS Status', 'System Boot Initiated System Restart', 'OS Boot BIOS_Boot_Up',
 48 |          'System Boot Initiated BIOS_Boot_UP', 'Memory DIMM101', 'OS graceful shutdown', 'OS Critical Stop OS Status',
 49 |          'Memory #0xf9', 'Memory CPU0C0_DIMM_Stat', 'Memory DIMM111', 'Memory DIMM021', ]
 50 | KEY_4 = ['Drive Fault', 'NMI/Diag Interrupt', 'Failure detected', 'Power Supply AC lost', 'Power Supply PSU0_Supply',
 51 |          'AC out-of-range, but present', 'Predictive failure', 'Drive Present', 'Temperature Temp_DIMM_KLM',
 52 |          'Temperature Temp_DIMM_DEF', 'Power Supply PS1_Status', 'Identify Status', 'Power Supply PS2_Status',
 53 |          'Temperature DIMMG1_Temp', 'Upper Non-critical going high', 'Temperature DIMMG0_Temp',
 54 |          'Upper Critical going high', 'Power Button pressed', 'System Boot Initiated #0xb8', 'Deasserted']
 55 | TOP_KEY_WORDS = ['0203c0a80101', 'Configuration Error', 'Correctable ECC', 'Deasserted', 'Device Enabled', 'Drive Present',
 56 |                  'Event Logging Disabled SEL', 'Failure detected', 'IERR', 'Initiated by hard reset', 'Initiated by power up',
 57 |                  'Initiated by warm reset', 'Log area reset/cleared', 'Memory', 'Memory #0xe2', 'Memory CPU0C0',
 58 |                  'Microcontroller/Coprocessor BMC', 'OEM CPU0 CATERR', 'OEM CPU0 MCERR', 'OS Boot BIOS',
 59 |                  'OS Critical Stop OS Status', 'Power Supply PS1', 'Power Supply PS2', 'Presence detected', 'Processor', 'Processor CPU', 'Processor CPU0',
 60 |                  'Processor CPU1', 'S0/G0: working', 'S4/S5: soft-off', 'Slot / Connector PCIE', 'State Asserted', 'State Deasserted',
 61 |                  'System ACPI Power State ACPI', 'System Boot Initiated', 'System Boot Initiated #0xe0', 'System Boot Initiated BIOS',
 62 |                  'System Event', 'System Event #0x10', 'System Event #0xff', 'Timestamp Clock Sync', 'Transition to Running', 'Uncorrectable ECC',
 63 |                  'Uncorrectable machine check exception', 'Unknown CPU0 CATERR', 'Unknown CPU0 MCERR', 'Unknown Chassis', 'Watchdog2 IPMI',
 64 |                  ]
 65 | TOP_KEY_WORDS_2 = ['Processor CPU0 Status', 'System Boot Initiated BIOS Boot Up', 'Uncorrectable ECC', 'Initiated by power up',
 66 |                    'Configuration Error', 'Processor CPU CATERR', 'Processor CPU1 Status', 'Memory #0xe2', 'IERR', 'Initiated by warm reset',
 67 |                    'State Asserted', 'S4/S5: soft-off', 'Memory #0xf9', 'S0/G0: working', 'boot completed - device not specified', 'Timestamp Clock Sync',
 68 |                    'Presence detected', 'System Boot Initiated #0xe0', 'Drive Fault', 'Power Supply PS1 Status', 'Power off/down', 'OS Boot #0xe9',
 69 |                    'Failure detected', 'Uncorrectable machine check exception', 'Transition to Running', 'Power Supply PS2 Status',
 70 |                    'Memory Device Disabled', 'System Restart', 'System Event #0x10', 'Sensor access degraded or unavailable', 'Unknown #0x17',
 71 |                    'Drive Present', 'Management Subsys Health System Health', 'Power Supply AC lost', 'Microcontroller #0x16']
 72 | CHARATERS = ['#', '&', ]
 73 | # KEY_WORDS = KEY_1 + KEY_2 + KEY_3 + KEY_4 + CHARATERS
 74 | KEY_WORDS = KEY_1 + KEY_2 + KEY_3 + KEY_4 + CHARATERS + TOP_KEY_WORDS
 75 | KEY_WORDS = list(set(KEY_WORDS))
 76 | # cnt_1_0_diff_key_words = ['State Asserted','Processor CPU_CATERR','Unknown #0x17','Microcontroller #0x16','Transition to Running','State Deasserted','Processor #0xfa','Temperature CPU1_Margin_Temp','Temperature CPU0_Margin_Temp','Power cycle','Management Subsys Health System_Health','Sensor access degraded or unavailable','Power off/down','System ACPI Power State #0x7d']
 77 | # key_words_0 = ['Temperature CPU0_Margin_Temp','Lower Critical going low','System ACPI Power State #0x7d','Temperature CPU1_Margin_Temp','Lower Non-critical going low','Uncorrectable machine check exception','Reading 0 &lt; Threshold 1 degrees C','000000','Unknown #0x19','Temperature DIMMG1_Temp','Reading 0 &lt; Threshold 0 degrees C','001c4c','IERR','Upper Critical going high','Unknown Chassis_control','Temperature DIMMG0_Temp','Upper Non-critical going high','Temperature Temp_DIMM_DEF','Power cycle','Processor CPU0_Status','Temperature Temp_DIMM_KLM','Processor CPU1_Status','Management Subsys Health System_Health']
 78 | # key_words_1 = ['Processor #0xfa','State Deasserted','Power off/down','Power cycle','IERR','Unknown #0x17','Management Subsys Health System_Health','Processor CPU_CATERR','Reading 0 &lt; Threshold 1 degrees C','','Sensor access degraded or unavailable','Transition to Running','State Asserted','Microcontroller #0x16','Processor CPU0_Status','Processor CPU1_Status','Slot / Connector PCIE_Status','Fault Status','System ACPI Power State ACPI_PWR_Status','Management Subsystem Health System_Health','Configuration Error','Uncorrectable machine check exception','Timestamp Clock Sync']
 79 | # key_words_2 = ['Memory #0xe2','Memory Device Disabled','Memory #0x87','Memory #0xf9','Correctable ECC','Memory CPU0D0_DIMM_Stat','Uncorrectable ECC','Memory CPU1B0_DIMM_Stat','System Boot Initiated BIOS_Boot_UP','System Restart','Presence Detected','Temperature CPU0_Temp','boot completed - device not specified','Log almost full','Device Present','Legacy OFF state','System Boot Initiated #0xe0','System Event #0x10','Legacy ON state','OS Boot #0xe0','Unknown #0xc5','System Boot Initiated #0xb8','Event Logging Disabled SEL_Status']
 80 | # key_words_3 = ['Drive Fault','Failure detected','Drive Present','Temperature Temp_DIMM_KLM','Temperature Temp_DIMM_DEF','Power Supply PS4_Status','Upper Non-critical going high','Temperature DIMMG0_Temp','Temperature DIMMG1_Temp','Power Supply PS3_Status','Upper Critical going high','Predictive failure','Power Supply AC lost','Unknown #0x19','Power Unit Power Unit','AC out-of-range, but present','Power Supply PS1_Status','Power Supply PS2_Status','Log area reset/cleared','Microcontroller/Coprocessor BMC_Boot_Up','System Boot Initiated #0xb8','Power Button pressed','Device Present']
 81 | # top_key_words = [ 'Configuration Error','Uncorrectable ECC','Processor CPU0_Status','Initiated by power up','','Presence Detected','Processor CPU1_Status','S0/G0: working','Processor CPU_CATERR','Presence detected','S4/S5: soft-off','Upper Critical going high','Memory #0xe2','IERR','Initiated by warm reset','State Asserted','Upper Non-critical going high','boot completed - device not specified','Memory Device Disabled','Timestamp Clock Sync','Lower Critical going low','Transition to Running','Memory #0xf9','Power Supply PS1_Status']
 82 | # key_words_1_desc = ['#0xfa', '#0x','#0xff','CATERR','cycle','Unit','IERR','IPMI','#0x17', 'Running','#0x7c','Unknown','CPU', 'Sensor','CPU0','CPU1','Subsys']
 83 | #
 84 | # key_words = cnt_1_0_diff_key_words +key_words_0+key_words_1+key_words_2+key_words_3+top_key_words+key_words_1_desc
 85 | # key_words = list(set(key_words))
 86 | # KEY_WORDS = key_words+CHARATERS
 87 | 
 88 | 
 89 | def create_dir(dir):
 90 |     """
 91 |     创建目录
 92 |     :param dir: 目录名
 93 |     :return:
 94 |     """
 95 |     if not os.path.exists(dir):
 96 |         os.mkdir(dir)
 97 |         print(f'{dir}目录不存在,创建{dir}目录成功.')
 98 |     else:
 99 |         print(f'{dir}目录已存在.')
100 | 
101 | 
102 | def create_all_dir():
103 |     """
104 |     创建所有需要的目录
105 |     :return:
106 |     """
107 |     create_dir(ROOT_DIR)
108 |     create_dir(LOG_DIR)
109 | 
110 |     # create_dir(MODEL_DIR)
111 |     create_dir(RESULT_DIR)
112 | 
113 |     create_dir(FEATURE_DIR)
114 |     create_dir(GENERATION_DIR)
115 |     create_dir(CORRELATION_DIR)
116 | 
117 |     create_dir(DATA_DIR)
118 |     create_dir(TRAIN_DIR)
119 |     create_dir(TEST_A_DIR)
120 |     # create_dir(TEST_B_DIR)
121 | 
122 |     create_dir(USER_DATA_DIR)
123 |     create_dir(USER_MODEL_DIR)
124 |     create_dir(TMP_DIR)
125 | 
126 | 
127 | def clean_str(string):
128 |     return string
129 | 
130 | 
131 | def my_tokenizer(s):
132 |     return s.split(' | ')
133 | 
134 | 
135 | def get_word_counter(data):
136 |     print('获取异常日志计数字典')
137 | 
138 |     counter = Counter()
139 |     for string_ in tqdm(data['msg']):
140 |         string_ = string_.strip()
141 |         counter.update(my_tokenizer(clean_str(string_)))
142 |     return counter
143 | 
144 | 
145 | def macro_f1(target_df: pd.DataFrame, submit_df: pd.DataFrame):
146 |     """
147 |     计算得分
148 |     :param target_df: [sn,fault_time,label]
149 |     :param submit_df: [sn,fault_time,label]
150 |     :return:
151 |     """
152 | 
153 |     weights = [5 / 11, 4 / 11, 1 / 11, 1 / 11]
154 | 
155 |     # weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]
156 |     overall_df = target_df.merge(
157 |         submit_df, how='left', on=[
158 |             'sn', 'fault_time'], suffixes=[
159 |             '_gt', '_pr'])
160 |     overall_df.fillna(-1)
161 |     macro_F1 = 0.
162 |     for i in range(len(weights)):
163 |         TP = len(overall_df[(overall_df['label_gt'] == i)
164 |                  & (overall_df['label_pr'] == i)])
165 |         FP = len(overall_df[(overall_df['label_gt'] != i)
166 |                  & (overall_df['label_pr'] == i)])
167 |         FN = len(overall_df[(overall_df['label_gt'] == i)
168 |                  & (overall_df['label_pr'] != i)])
169 |         precision = TP / (TP + FP) if (TP + FP) > 0 else 0
170 |         recall = TP / (TP + FN) if (TP + FN) > 0 else 0
171 |         F1 = 2 * precision * recall / \
172 |             (precision + recall) if (precision + recall) > 0 else 0
173 |         macro_F1 += weights[i] * F1
174 |     return macro_F1
175 | 
176 | 
177 | def search_weight(train, valid_y, raw_prob, init_weight=[
178 |                   1.0], class_num=4, step=0.001):
179 |     weight = init_weight.copy() * class_num
180 |     oof = train[['sn', 'fault_time']]
181 |     oof['label'] = raw_prob.argmax(axis=1)
182 |     f_best = macro_f1(train[['sn', 'fault_time', 'label']], oof)
183 |     print("Inint Score:", f_best)
184 | 
185 |     #     f_best = f1_score(y_true=valid_y, y_pred=raw_prob.argmax(axis=1),average='macro')
186 |     flag_score = 0
187 |     round_num = 1
188 |     while (flag_score != f_best):
189 |         print("round: ", round_num)
190 |         round_num += 1
191 |         flag_score = f_best
192 |         for c in range(class_num):
193 |             for n_w in range(0, 2000, 10):
194 |                 num = n_w * step
195 |                 new_weight = weight.copy()
196 |                 new_weight[c] = num
197 |                 prob_df = raw_prob.copy()
198 |                 prob_df = prob_df * np.array(new_weight)
199 | 
200 |                 oof['label'] = prob_df.argmax(axis=1)
201 |                 f = macro_f1(train[['sn', 'fault_time', 'label']], oof)
202 |                 #                 f = f1_score(y_true=valid_y, y_pred=prob_df.argmax(axis=1),average='macro')
203 |                 if f > f_best:
204 |                     weight = new_weight.copy()
205 |                     f_best = f
206 |                     print(f"class:{c}, new_weight:{num}, f1 score: {f}")
207 |     print(
208 |         f'********************** SEARCH BEST WEIGHT : {weight} **********************')
209 |     return weight
210 | 
211 | 
212 | def get_new_cols(df, key=['sn', 'fault_time']):
213 |     if isinstance(df.columns[0], tuple):
214 | 
215 |         new_cols = []
216 |         for i in df.columns:
217 |             if i[0] in key:
218 |                 new_cols.append(i[0])
219 |             else:
220 |                 new_cols.append(f'{i[0]}_{i[1]}')
221 |         df.columns = new_cols
222 |         return df
223 |     else:
224 |         print('当前的DataFrame没有二级列名，请检查。')
225 |         return df
226 | 
227 | 
228 | if __name__ == '__main__':
229 |     # create_all_dir()
230 |     logger = Logger(name=os.path.basename(__file__).split(
231 |         '.py')[0], log_path=LOG_DIR, mode="w").get_log
232 |     print(len(KEY_WORDS))
233 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/lgb_fs.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import warnings
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | import datetime
  7 | from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
  8 |     get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
  9 |     get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
 10 |     get_w2v_feats
 11 | from model import run_cbt,run_lgb
 12 | from utils import RESULT_DIR, TRAIN_DIR, \
 13 |     TEST_A_DIR, KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL,PSEUDO_FALG,GENERATION_DIR
 14 | 
 15 | warnings.filterwarnings('ignore')
 16 | 
 17 | 
 18 | def get_label(PSEUDO_FALG):
 19 |     preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
 20 |     preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
 21 | 
 22 |     if PSEUDO_FALG:
 23 |         print('获取伪标签LABEL')
 24 |         pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
 25 |         label = pd.concat([preliminary_train_label_dataset,
 26 |                            pseudo_labels,
 27 |                            preliminary_train_label_dataset_s],
 28 |                           ignore_index=True,
 29 |                           axis=0).sort_values(
 30 |             ['sn', 'fault_time']).reset_index(drop=True)
 31 |     else:
 32 |         print('不使用伪标签数据')
 33 |         label = pd.concat([preliminary_train_label_dataset,
 34 |                            preliminary_train_label_dataset_s],
 35 |                           ignore_index=True,
 36 |                           axis=0).sort_values(
 37 |             ['sn', 'fault_time']).reset_index(drop=True)
 38 |     label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
 39 |     label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
 40 |     return label
 41 | 
 42 | 
 43 | def get_log_dateset(PSEUDO_FALG):
 44 |     preliminary_sel_log_dataset = pd.read_csv(preliminary_sel_log_dataset_path)
 45 |     preliminary_sel_log_dataset_a = pd.read_csv(preliminary_sel_log_dataset_a_path)
 46 |     if PSEUDO_FALG:
 47 |         print('获取伪标签日志数据')
 48 |         pseudo_sel_log_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_sel_log_dataset.csv'))
 49 |         log_dataset = pd.concat([preliminary_sel_log_dataset,
 50 |                                  pseudo_sel_log_dataset,
 51 |                                  preliminary_sel_log_dataset_a],
 52 |                                 ignore_index=True,
 53 |                                 axis=0).sort_values(
 54 |             ['sn', 'time', 'server_model']).reset_index(drop=True)
 55 |     else:
 56 |         print('不使用伪标签数据')
 57 |         log_dataset = pd.concat([preliminary_sel_log_dataset,
 58 |                                  preliminary_sel_log_dataset_a],
 59 |                                 ignore_index=True,
 60 |                                 axis=0).sort_values(
 61 |             ['sn', 'time', 'server_model']).reset_index(drop=True)
 62 |     log_dataset['time'] = log_dataset['time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
 63 | 
 64 |     return log_dataset
 65 | 
 66 | def get_fea_distribute(feature_df, feature_importances, dataset_type, top=30):
 67 |     print('根据特征重要性，获取数据集的分布情况，用于验证训练集和测试集是否分布一致')
 68 |     fea_distribute_list = []
 69 |     for i in feature_importances[:top]['fea'].to_list():
 70 |         fea_distribute_tmp = (feature_df[i].value_counts() / len(feature_df)).reset_index().rename(
 71 |             columns={'index': 'value'})
 72 |         fea_distribute_list.append(fea_distribute_tmp)
 73 | 
 74 |     fea_distribute = fea_distribute_list[-1]
 75 |     for i in fea_distribute_list[:-1]:
 76 |         fea_distribute = fea_distribute.merge(i, on='value', how='left')
 77 |     fea_distribute['value'] = fea_distribute['value'].apply(lambda x: f'{dataset_type}_{int(x)}')
 78 |     return fea_distribute
 79 | 
 80 | 
 81 | def get_train_test(label, preliminary_submit_dataset_a, log_dataset):
 82 |     print('获取训练集数据与测试集数据')
 83 |     train = label.merge(log_dataset, on='sn', how='left')
 84 |     test = preliminary_submit_dataset_a.merge(log_dataset, on='sn', how='left')
 85 |     #     train['time_interval']  = (pd.to_datetime( train['fault_time'])-train['time']  ).apply(lambda x:x.total_seconds())
 86 |     #     test['time_interval']  = (pd.to_datetime( test['fault_time'])- test['time']  ).apply(lambda x:x.total_seconds())
 87 |     #     train = train.query('time_interval > 0')
 88 |     #     test = test.query('time_interval > 0')
 89 |     print(f'训练集维度:{train.shape},测试集维度:{test.shape}')
 90 |     train = train.drop_duplicates().reset_index(drop=True)
 91 |     test = test.drop_duplicates().reset_index(drop=True)
 92 |     train['time'] = pd.to_datetime(train['time'])
 93 |     test['time'] = pd.to_datetime(test['time'])
 94 |     return train, test
 95 | 
 96 | start_time = datetime.datetime.now()
 97 | 
 98 | additional_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'additional_sel_log_dataset.csv')
 99 | preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
100 | preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
101 | preliminary_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_sel_log_dataset.csv')
102 | 
103 | preliminary_submit_dataset_a_path = os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv')
104 | preliminary_sel_log_dataset_a_path = os.path.join(TEST_A_DIR, 'final_sel_log_dataset_b.csv')
105 | 
106 | print(preliminary_submit_dataset_a_path, preliminary_sel_log_dataset_a_path)
107 | 
108 | preliminary_submit_dataset_a = pd.read_csv(preliminary_submit_dataset_a_path)
109 | preliminary_submit_dataset_a.head()
110 | 
111 | log_dataset = get_log_dateset(PSEUDO_FALG)
112 | label = get_label(PSEUDO_FALG)
113 | 
114 | 
115 | next_time_list = [i / TIME_INTERVAL for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600]] + [1000000]
116 | 
117 | label, preliminary_submit_dataset_a = add_last_next_time4fault(label, preliminary_submit_dataset_a, TIME_INTERVAL,
118 |                                                                next_time_list)
119 | train, test = get_train_test(label, preliminary_submit_dataset_a, log_dataset)
120 | train = train.drop_duplicates(['sn', 'fault_time', 'time', 'msg', 'server_model']).reset_index(drop=True)
121 | 
122 | train['time_interval'] = (pd.to_datetime(train['fault_time']) - pd.to_datetime(train['time'])).apply(
123 |     lambda x: x.total_seconds())
124 | test['time_interval'] = (pd.to_datetime(test['fault_time']) - pd.to_datetime(test['time'])).apply(
125 |     lambda x: x.total_seconds())
126 | 
127 | all_data = pd.concat([train, test], axis=0, ignore_index=True)
128 | all_data = all_data.sort_values(['sn','server_model', 'fault_time', 'time'])
129 | w2v_feats = get_w2v_feats(all_data,
130 |                           f1_list = ['sn'],
131 |                           f2_list = ['msg_list', 'msg_0', 'msg_1', 'msg_2'])
132 | 
133 | # 获取 server_model_time_interval_stat_fea
134 | server_model_time_interval_stat_fea = get_server_model_time_interval_stat_fea(all_data)
135 | 
136 | msg_text_fea = get_msg_text_fea_all(all_data)
137 | # 获取时间差特征
138 | duration_minutes_fea = get_duration_minutes_fea(train, test)
139 | 
140 | # 获取时间server_model特征
141 | server_model_fea = get_server_model_fea(train, test)
142 | counter = get_word_counter(train)
143 | 
144 | # 获取时间 nearest_msg 特征
145 | nearest_msg_fea = get_nearest_msg_fea(train, test)
146 | # 获取时间 server_model beta_target 特征
147 | beta_target_fea = get_beta_target(train, test)
148 | 
149 | key = ['sn', 'fault_time', 'label', 'server_model']
150 | 
151 | fea_num = len(KEY_WORDS)
152 | time_list = [i * TIME_INTERVAL for i in next_time_list]
153 | train = get_feature(train, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'label', 'server_model'])
154 | test = get_feature(test, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'server_model'])
155 | 
156 | print('添加 时间差 特征')
157 | train = train.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
158 | test = test.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
159 | 
160 | print('添加 server_model特征')
161 | train = train.merge(server_model_fea, on=['sn', 'server_model'])
162 | test = test.merge(server_model_fea, on=['sn', 'server_model'])
163 | 
164 | print('添加 w2v_feats')
165 | train = train.merge(w2v_feats, on=['sn' ])
166 | test = test.merge(w2v_feats, on=['sn', ])
167 | 
168 | print('添加 nearest_msg 特征')
169 | train = train.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
170 | test = test.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
171 | 
172 | print('添加 beta_target 特征')
173 | train = train.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
174 | test = test.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
175 | 
176 | server_model_sn_fea_2 = get_server_model_sn_fea_2(train, test)
177 | print('添加 server_model_sn_fea_2 特征')
178 | train = train.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
179 | test = test.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
180 | 
181 | # crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR,'crashdump_venus_fea.csv') )
182 | # print('添加 crashdump_venus_fea 特征')
183 | # print(train.shape,test.shape,crashdump_venus_fea.shape)
184 | # train = train.merge(crashdump_venus_fea, on=['sn' , 'fault_time'],how = 'left')
185 | # test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time' ],how = 'left')
186 | # print(train.shape,test.shape )
187 | 
188 | crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR,'crashdump_venus_fea_v1.csv') )
189 | print('添加 crashdump_venus_fea 特征')
190 | print(train.shape,test.shape,crashdump_venus_fea.shape)
191 | train = train.merge(crashdump_venus_fea, on=['sn' , 'fault_time'],how = 'left')
192 | test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time' ],how = 'left')
193 | print(train.shape,test.shape )
194 | test.to_csv(os.path.join(GENERATION_DIR,'test.csv'),index =False)
195 | train.to_csv(os.path.join(GENERATION_DIR,'train.csv'),index =False)
196 | 
197 | # print('添加 msg_text_fea 特征')
198 | # train = train.merge(msg_text_fea, on=['sn', 'fault_time' ], how='left')
199 | # test = test.merge(msg_text_fea, on=['sn', 'fault_time'], how='left')
200 | 
201 | # print('添加 关键词交叉特征  ')
202 | # train,test = get_key_word_cross_fea(train,test)
203 | 
204 | # print('添加 server_model_time_interval_stat_fea 特征')
205 | # train = train.merge(server_model_time_interval_stat_fea, on=['server_model' ],how ='left')
206 | # test = test.merge(server_model_time_interval_stat_fea, on=['server_model'  ],how ='left')
207 | 
208 | 
209 | use_less_cols_1 = ['last_last_msg_cnt', 'last_first_msg_cnt','time_diff_1_min',
210 |        'last_msg_list_unique_LabelEnc', 'last_msg_0_unique_LabelEnc',
211 |        'last_msg_1_unique_LabelEnc', 'last_msg_2_unique_LabelEnc',
212 |        'last_msg_list_list_LabelEnc', 'last_msg_0_list_LabelEnc',
213 |        'last_msg_1_list_LabelEnc', 'last_msg_2_list_LabelEnc',
214 |        'last_msg_0_first_LabelEnc', 'last_msg_1_first_LabelEnc',
215 |        'last_msg_2_first_LabelEnc', 'last_msg_0_last_LabelEnc',
216 |        'last_msg_1_last_LabelEnc', 'last_msg_2_last_LabelEnc',
217 |        'last_msg_last_LabelEnc', 'last_msg_first_LabelEnc']
218 | 
219 | use_less_col = [i for i in train.columns if train[i].nunique() < 2] + use_less_cols_1
220 | 
221 | 
222 | print(f'use_less_col:{len(use_less_col)}')
223 | use_cols = [i for i in train.columns if i not in ['sn', 'fault_time', 'label', 'server_model'] + use_less_col]
224 | cat_cols = ['server_model_LabelEnc', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc',]
225 | use_cols = sorted(use_cols)
226 | print('使用的特征维度:',len(use_cols))
227 | 
228 | # cat_cols = []
229 | # for i in use_cols:
230 | #     if '_LabelEnc' in i:
231 | #         cat_cols.append(i)
232 | 
233 | oof_prob = np.zeros((train.shape[0], 4))
234 | test_prob = np.zeros((test.shape[0], 4))
235 | # seeds = [42,4242,40424,1024,2048]
236 | seeds = [42 ]
237 | for seed in seeds:
238 |     oof_prob, test_prob, fea_imp_df, model_list = run_lgb(train[use_cols], train[['label']], test[use_cols], k=5,
239 |                                               seed=seed, cat_cols=cat_cols)
240 |     oof_prob +=oof_prob/len(seeds)
241 |     test_prob +=test_prob/len(seeds)
242 | 
243 | weight = search_weight(train, train[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
244 | oof_prob = oof_prob * np.array(weight)
245 | test_prob = test_prob * np.array(weight)
246 | 
247 | target_df = train[['sn', 'fault_time', 'label']]
248 | submit_df = train[['sn', 'fault_time']]
249 | submit_df['label'] = oof_prob.argmax(axis=1)
250 | 
251 | score = macro_f1(target_df=target_df, submit_df=submit_df)
252 | print(f'********************** BEST MACRO_F1 : {score} **********************')
253 | score = round(score, 5)
254 | 
255 | y_pred = test_prob.argmax(axis=1)
256 | result = test[['sn', 'fault_time']]
257 | result['label'] = y_pred
258 | result = preliminary_submit_dataset_a.merge(result, on=['sn', 'fault_time'], how='left')[['sn', 'fault_time', 'label']]
259 | result['label'] = result['label'].fillna(0).astype(int)
260 | 
261 | result.to_csv(os.path.join(RESULT_DIR, f'lgb_result.csv'), index=False)
262 | 
263 | fea_imp_df = fea_imp_df.reset_index(drop=True)
264 | fea_imp_df.to_csv(os.path.join(RESULT_DIR, f'./lgb_fea_imp_{int(score * 100000)}.csv'), index=False)
265 | 
266 | train_result_prob = pd.DataFrame(oof_prob).add_prefix('lgb_class_')
267 | test_result_prob = pd.DataFrame(test_prob).add_prefix('lgb_class_')
268 | train_result_prob['label'] = train['label']
269 | train_result_prob['sn'] = train['sn']
270 | train_result_prob['fault_time'] = train['fault_time']
271 | test_result_prob['sn'] = test['sn']
272 | test_result_prob['fault_time'] = test['fault_time']
273 | 
274 | result_prob = pd.concat([train_result_prob,test_result_prob],ignore_index = True)
275 | result_prob.to_csv(os.path.join(RESULT_DIR,f'lgb_prob_result.csv'),index = False)
276 | 
277 | 
278 | end_time = datetime.datetime.now()
279 | cost_time = end_time - start_time
280 | print('****************** LIGHTGBM COST TIME : ',str(cost_time),' ******************')
281 | 
282 | '''
283 | 
284 | v7 最优版本  线下 7356
285 | v8: v7 添加 关键词交叉特征 线下 0.7357  线上 7338
286 | v8.1 v7 添加 关键词交叉特征 并作为类别变量输入模型 0.73361
287 | v8.2 v7 添加 关键词交叉特征 并作为类别变量输入模型  删除 TOP_KEY_WORDS 7117
288 | v8.3 v7 添加 关键词交叉特征 并作为类别变量输入模型  使用 TOP_KEY_WORDS_2 7260
289 | v8.3 v7 添加 关键词交叉特征 并作为类别变量输入模型  添加 TOP_KEY_WORDS_2 7260
290 | 
291 | '''
292 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/catboost_fs.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import os
  3 | import warnings
  4 | 
  5 | import numpy as np
  6 | import pandas  as pd
  7 | 
  8 | 
  9 | from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
 10 |     get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
 11 |     get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
 12 |     get_w2v_feats, get_key_for_top_fea,get_time_diff_feats_v2
 13 | from model import run_cbt
 14 | from utils import RESULT_DIR, TRAIN_DIR, \
 15 |     TEST_A_DIR, KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL,PSEUDO_FALG,GENERATION_DIR
 16 | 
 17 | warnings.filterwarnings('ignore')
 18 | 
 19 | 
 20 | def get_label(PSEUDO_FALG):
 21 |     preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
 22 |     preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
 23 | 
 24 |     if PSEUDO_FALG:
 25 |         print('获取伪标签LABEL')
 26 |         pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
 27 |         label = pd.concat([preliminary_train_label_dataset,
 28 |                            pseudo_labels,
 29 |                            preliminary_train_label_dataset_s],
 30 |                           ignore_index=True,
 31 |                           axis=0).sort_values(
 32 |             ['sn', 'fault_time']).reset_index(drop=True)
 33 |     else:
 34 |         print('不使用伪标签数据')
 35 |         label = pd.concat([preliminary_train_label_dataset,
 36 |                            preliminary_train_label_dataset_s],
 37 |                           ignore_index=True,
 38 |                           axis=0).sort_values(
 39 |             ['sn', 'fault_time']).reset_index(drop=True)
 40 |     label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
 41 |     label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
 42 |     return label
 43 | 
 44 | 
 45 | def get_log_dateset(PSEUDO_FALG):
 46 |     preliminary_sel_log_dataset = pd.read_csv(preliminary_sel_log_dataset_path)
 47 |     preliminary_sel_log_dataset_a = pd.read_csv(preliminary_sel_log_dataset_a_path)
 48 |     if PSEUDO_FALG:
 49 |         print('获取伪标签日志数据')
 50 |         pseudo_sel_log_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_sel_log_dataset.csv'))
 51 |         log_dataset = pd.concat([preliminary_sel_log_dataset,
 52 |                                  pseudo_sel_log_dataset,
 53 |                                  preliminary_sel_log_dataset_a],
 54 |                                 ignore_index=True,
 55 |                                 axis=0).sort_values(
 56 |             ['sn', 'time', 'server_model']).reset_index(drop=True)
 57 |     else:
 58 |         print('不使用伪标签数据')
 59 |         log_dataset = pd.concat([preliminary_sel_log_dataset,
 60 |                                  preliminary_sel_log_dataset_a],
 61 |                                 ignore_index=True,
 62 |                                 axis=0).sort_values(
 63 |             ['sn', 'time', 'server_model']).reset_index(drop=True)
 64 |     log_dataset['time'] = log_dataset['time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
 65 | 
 66 |     return log_dataset
 67 | 
 68 | def get_fea_distribute(feature_df, feature_importances, dataset_type, top=30):
 69 |     print('根据特征重要性，获取数据集的分布情况，用于验证训练集和测试集是否分布一致')
 70 |     fea_distribute_list = []
 71 |     for i in feature_importances[:top]['fea'].to_list():
 72 |         fea_distribute_tmp = (feature_df[i].value_counts() / len(feature_df)).reset_index().rename(
 73 |             columns={'index': 'value'})
 74 |         fea_distribute_list.append(fea_distribute_tmp)
 75 | 
 76 |     fea_distribute = fea_distribute_list[-1]
 77 |     for i in fea_distribute_list[:-1]:
 78 |         fea_distribute = fea_distribute.merge(i, on='value', how='left')
 79 |     fea_distribute['value'] = fea_distribute['value'].apply(lambda x: f'{dataset_type}_{int(x)}')
 80 |     return fea_distribute
 81 | 
 82 | 
 83 | 
 84 | def get_train_test(label, preliminary_submit_dataset_a, log_dataset):
 85 |     print('获取训练集数据与测试集数据')
 86 |     train = label.merge(log_dataset, on='sn', how='left')
 87 |     test = preliminary_submit_dataset_a.merge(log_dataset, on='sn', how='left')
 88 |     #     train['time_interval']  = (pd.to_datetime( train['fault_time'])-train['time']  ).apply(lambda x:x.total_seconds())
 89 |     #     test['time_interval']  = (pd.to_datetime( test['fault_time'])- test['time']  ).apply(lambda x:x.total_seconds())
 90 |     #     train = train.query('time_interval > 0')
 91 |     #     test = test.query('time_interval > 0')
 92 |     print(f'训练集维度:{train.shape},测试集维度:{test.shape}')
 93 |     train = train.drop_duplicates().reset_index(drop=True)
 94 |     test = test.drop_duplicates().reset_index(drop=True)
 95 |     train['time'] = pd.to_datetime(train['time'])
 96 |     test['time'] = pd.to_datetime(test['time'])
 97 |     return train, test
 98 | start_time = datetime.datetime.now()
 99 | 
100 | additional_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'additional_sel_log_dataset.csv')
101 | preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
102 | preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
103 | preliminary_sel_log_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_sel_log_dataset.csv')
104 | 
105 | preliminary_submit_dataset_a_path = os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv')
106 | preliminary_sel_log_dataset_a_path = os.path.join(TEST_A_DIR, 'final_sel_log_dataset_b.csv')
107 | 
108 | print(preliminary_submit_dataset_a_path, preliminary_sel_log_dataset_a_path)
109 | 
110 | preliminary_submit_dataset_a = pd.read_csv(preliminary_submit_dataset_a_path)
111 | preliminary_submit_dataset_a.head()
112 | 
113 | log_dataset = get_log_dateset(PSEUDO_FALG)
114 | label = get_label(PSEUDO_FALG)
115 | 
116 | 
117 | next_time_list = [i / TIME_INTERVAL for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600]] + [1000000]
118 | 
119 | label, preliminary_submit_dataset_a = add_last_next_time4fault(label, preliminary_submit_dataset_a, TIME_INTERVAL,
120 |                                                                next_time_list)
121 | train, test = get_train_test(label, preliminary_submit_dataset_a, log_dataset)
122 | train = train.drop_duplicates(['sn', 'fault_time', 'time', 'msg', 'server_model']).reset_index(drop=True)
123 | 
124 | train['time_interval'] = (pd.to_datetime(train['fault_time']) - pd.to_datetime(train['time'])).apply(
125 |     lambda x: x.total_seconds())
126 | test['time_interval'] = (pd.to_datetime(test['fault_time']) - pd.to_datetime(test['time'])).apply(
127 |     lambda x: x.total_seconds())
128 | 
129 | all_data = pd.concat([train, test], axis=0, ignore_index=True)
130 | all_data = all_data.sort_values(['sn','server_model', 'fault_time', 'time'])
131 | w2v_feats = get_w2v_feats(all_data,
132 |                           f1_list = ['sn'],
133 |                           f2_list = ['msg_list', 'msg_0', 'msg_1', 'msg_2'])
134 | # 获取 time_diff_feats_v2
135 | time_diff_feats_v2 = get_time_diff_feats_v2(all_data)
136 | # 获取 server_model_time_interval_stat_fea
137 | server_model_time_interval_stat_fea = get_server_model_time_interval_stat_fea(all_data)
138 | 
139 | msg_text_fea = get_msg_text_fea_all(all_data)
140 | # 获取时间差特征
141 | duration_minutes_fea = get_duration_minutes_fea(train, test)
142 | 
143 | # 获取时间server_model特征
144 | server_model_fea = get_server_model_fea(train, test)
145 | counter = get_word_counter(train)
146 | 
147 | # 获取时间 nearest_msg 特征
148 | nearest_msg_fea = get_nearest_msg_fea(train, test)
149 | # 获取时间 server_model beta_target 特征
150 | beta_target_fea = get_beta_target(train, test)
151 | 
152 | key = ['sn', 'fault_time', 'label', 'server_model']
153 | 
154 | fea_num = len(KEY_WORDS)
155 | time_list = [i * TIME_INTERVAL for i in next_time_list]
156 | train = get_feature(train, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'label', 'server_model'])
157 | test = get_feature(test, time_list, KEY_WORDS, fea_num, key=['sn', 'fault_time', 'server_model'])
158 | 
159 | print('添加 时间差 特征')
160 | train = train.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
161 | test = test.merge(duration_minutes_fea, on=['sn', 'fault_time', 'server_model'])
162 | 
163 | print('添加 server_model特征')
164 | train = train.merge(server_model_fea, on=['sn', 'server_model'])
165 | test = test.merge(server_model_fea, on=['sn', 'server_model'])
166 | 
167 | print('添加 w2v_feats')
168 | train = train.merge(w2v_feats, on=['sn' ])
169 | test = test.merge(w2v_feats, on=['sn', ])
170 | 
171 | print('添加 nearest_msg 特征')
172 | train = train.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
173 | test = test.merge(nearest_msg_fea, on=['sn', 'server_model', 'fault_time'])
174 | 
175 | print('添加 beta_target 特征')
176 | train = train.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
177 | test = test.merge(beta_target_fea, on=['sn', 'server_model', 'fault_time'])
178 | 
179 | server_model_sn_fea_2 = get_server_model_sn_fea_2(train, test)
180 | print('添加 server_model_sn_fea_2 特征')
181 | train = train.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
182 | test = test.merge(server_model_sn_fea_2, on=['sn', 'server_model'])
183 | 
184 | print('添加 time_diff_feats_v2 特征')
185 | train = train.merge(time_diff_feats_v2, on=['sn', 'server_model', 'fault_time'])
186 | test = test.merge(time_diff_feats_v2, on=['sn', 'server_model', 'fault_time'])
187 | 
188 | # test.to_csv(os.path.join(GENERATION_DIR,'test.csv'),index =False)
189 | # train.to_csv(os.path.join(GENERATION_DIR,'train.csv'),index =False)
190 | 
191 | # crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR,'crashdump_venus_fea.csv') )
192 | # print('添加 crashdump_venus_fea 特征')
193 | # print(train.shape,test.shape,crashdump_venus_fea.shape)
194 | # train = train.merge(crashdump_venus_fea, on=['sn' , 'fault_time'],how = 'left')
195 | # test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time' ],how = 'left')
196 | # print(train.shape,test.shape )
197 | 
198 | # crashdump_venus_fea = pd.read_csv(os.path.join(GENERATION_DIR,'crashdump_venus_fea_v1.csv') )
199 | # print('添加 crashdump_venus_fea 特征')
200 | # print(train.shape,test.shape,crashdump_venus_fea.shape)
201 | # train = train.merge(crashdump_venus_fea, on=['sn' , 'fault_time'],how = 'left')
202 | # test = test.merge(crashdump_venus_fea, on=['sn', 'fault_time' ],how = 'left')
203 | # print(train.shape,test.shape )
204 | # test.to_csv(os.path.join(GENERATION_DIR,'test.csv'),index =False)
205 | # train.to_csv(os.path.join(GENERATION_DIR,'train.csv'),index =False)
206 | # print('添加 key_for_top_fea 特征')
207 | # train,test = get_key_for_top_fea(train,test)
208 | 
209 | # print('添加 w2v_tfidf_doc2v_fea 特征')
210 | # w2v_tfidf_fea = pd.read_csv(os.path.join(GENERATION_DIR,'w2v_tfidf_fea.csv'))
211 | # drop_cols = [i for i in w2v_tfidf_fea if 'doc2vec' in i ]+[i for i in w2v_tfidf_fea if 'tfidf' in i ]
212 | # for col in drop_cols:
213 | #     del w2v_tfidf_fea[col]
214 | #
215 | # train = train.merge(w2v_tfidf_fea, on=['sn'  ], how='left')
216 | # test = test.merge(w2v_tfidf_fea, on=['sn' ], how='left')
217 | 
218 | # print('添加 关键词交叉特征  ')
219 | # train,test = get_key_word_cross_fea(train,test)
220 | 
221 | # print('添加 server_model_time_interval_stat_fea 特征')
222 | # train = train.merge(server_model_time_interval_stat_fea, on=['server_model' ],how ='left')
223 | # test = test.merge(server_model_time_interval_stat_fea, on=['server_model'  ],how ='left')
224 | 
225 | 
226 | use_less_cols_1 = ['last_last_msg_cnt', 'last_first_msg_cnt','time_diff_1_min',
227 |        'last_msg_list_unique_LabelEnc', 'last_msg_0_unique_LabelEnc',
228 |        'last_msg_1_unique_LabelEnc', 'last_msg_2_unique_LabelEnc',
229 |        'last_msg_list_list_LabelEnc', 'last_msg_0_list_LabelEnc',
230 |        'last_msg_1_list_LabelEnc', 'last_msg_2_list_LabelEnc',
231 |        'last_msg_0_first_LabelEnc', 'last_msg_1_first_LabelEnc',
232 |        'last_msg_2_first_LabelEnc', 'last_msg_0_last_LabelEnc',
233 |        'last_msg_1_last_LabelEnc', 'last_msg_2_last_LabelEnc',
234 |        'last_msg_last_LabelEnc', 'last_msg_first_LabelEnc']
235 | 
236 | use_less_col = [i for i in train.columns if train[i].nunique() < 2] + use_less_cols_1
237 | 
238 | 
239 | print(f'use_less_col:{len(use_less_col)}')
240 | use_cols = [i for i in train.columns if i not in ['sn', 'fault_time', 'label', 'server_model'] + use_less_col]
241 | 
242 | cat_cols = ['server_model_LabelEnc', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc',]
243 | use_cols = sorted(use_cols)
244 | 
245 | cat_cols = []
246 | for i in use_cols:
247 |     if '_LabelEnc' in i:
248 |         cat_cols.append(i)
249 | print('使用的特征维度:',len(use_cols),'类别特征维度:',len(cat_cols))
250 | # fs = FeatureSelector(data=train[use_cols], labels=train['label'])
251 | #
252 | # # 选择出missing value 百分比大于60%的特征
253 | # fs.identify_missing(missing_threshold=0.9)
254 | #
255 | # # # 查看选择出的特征
256 | # # fs.ops['missing']
257 | # # 不对feature进行one-hot encoding（默认为False）, 然后选择出相关性大于98%的feature,
258 | # fs.identify_collinear(correlation_threshold=0.99, one_hot=False)
259 | #
260 | # # # 查看选择的feature
261 | # # fs.ops['collinear']
262 | #
263 | # # 选择出只有单个值的feature
264 | # fs.identify_single_unique()
265 | #
266 | # # # 查看选择出的feature
267 | # # fs.ops['single_unique']
268 | #
269 | # train_removed = fs.remove(methods = ['missing', 'single_unique', 'collinear',], keep_one_hot=False)
270 | # use_cols = train_removed.columns
271 | # print('特征选择之后，使用的特征维度:',len(use_cols))
272 | 
273 | 
274 | oof_prob = np.zeros((train.shape[0], 4))
275 | test_prob = np.zeros((test.shape[0], 4))
276 | # seeds = [42,4242,40424,1024,2048]
277 | seeds = [42 ]
278 | for seed in seeds:
279 |     oof_prob, test_prob, fea_imp_df, model_list = run_cbt(train[use_cols] , train[['label']] , test[use_cols], k=5,
280 |                                               seed=seed, cat_cols=cat_cols)
281 |     oof_prob +=oof_prob/len(seeds)
282 |     test_prob +=test_prob/len(seeds)
283 | 
284 | 
285 | weight = search_weight(train, train[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
286 | oof_prob = oof_prob * np.array(weight)
287 | test_prob = test_prob * np.array(weight)
288 | 
289 | target_df = train[['sn', 'fault_time', 'label']]
290 | submit_df = train[['sn', 'fault_time']]
291 | submit_df['label'] = oof_prob.argmax(axis=1)
292 | 
293 | score = macro_f1(target_df=target_df, submit_df=submit_df)
294 | print(f'********************** BEST MACRO_F1 : {score} **********************')
295 | score = round(score, 5)
296 | 
297 | y_pred = test_prob.argmax(axis=1)
298 | result = test[['sn', 'fault_time']]
299 | result['label'] = y_pred
300 | result = preliminary_submit_dataset_a.merge(result, on=['sn', 'fault_time'], how='left')[['sn', 'fault_time', 'label']]
301 | result['label'] = result['label'].fillna(0).astype(int)
302 | 
303 | result.to_csv(os.path.join(RESULT_DIR,f'catboost_result.csv'), index=False)
304 | print(result['label'].value_counts())
305 | fea_imp_df = fea_imp_df.reset_index(drop = True)
306 | fea_imp_df.to_csv(os.path.join(RESULT_DIR,f'./cat_fea_imp_{int(score*100000)}.csv'),index = False)
307 | 
308 | train_result_prob = pd.DataFrame(oof_prob).add_prefix('cat_class_')
309 | test_result_prob = pd.DataFrame(test_prob).add_prefix('cat_class_')
310 | train_result_prob['label'] = train['label']
311 | train_result_prob['sn'] = train['sn']
312 | train_result_prob['fault_time'] = train['fault_time']
313 | test_result_prob['sn'] = test['sn']
314 | test_result_prob['fault_time'] = test['fault_time']
315 | 
316 | result_prob = pd.concat([train_result_prob,test_result_prob],ignore_index = True)
317 | result_prob.to_csv(os.path.join(RESULT_DIR,f'cat_prob_result.csv'),index = False)
318 | 
319 | end_time = datetime.datetime.now()
320 | cost_time = end_time - start_time
321 | print('****************** CATBOOST COST TIME : ',str(cost_time),' ******************')
322 | 
323 | '''
324 |  
325 | v7: 最优 线下 0.7303
326 | v8: v7 添加 关键词交叉特征 并作为类别变量输入模型 0.73114
327 | 
328 | '''


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/get_crashdump_venus_fea.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import os
  3 | import gc
  4 | import warnings
  5 | import pandas as pd
  6 | import pickle
  7 | from gensim.models.word2vec import Word2Vec
  8 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument
  9 | from sklearn.utils.class_weight import compute_class_weight
 10 | from sklearn.preprocessing import LabelEncoder
 11 | from sklearn.feature_extraction.text import TfidfVectorizer
 12 | from sklearn.decomposition import TruncatedSVD
 13 | import numpy as np
 14 | import pandas as pd
 15 | from generate_feature import add_w2v_feats, cat2num
 16 | from generate_feature import get_key
 17 | 
 18 | from generate_feature import get_beta_target, add_last_next_time4fault, get_feature, \
 19 |     get_duration_minutes_fea, get_nearest_msg_fea, get_server_model_sn_fea_2, \
 20 |     get_server_model_fea, get_msg_text_fea_all, get_key_word_cross_fea, get_server_model_time_interval_stat_fea, \
 21 |     get_w2v_feats, get_key, get_class_key_words_nunique
 22 | from model import run_cbt, run_lgb
 23 | from utils import RESULT_DIR, TRAIN_DIR, \
 24 |     TEST_A_DIR, KEY_WORDS, TOP_KEY_WORDS, get_word_counter, search_weight, macro_f1, TIME_INTERVAL, PSEUDO_FALG, \
 25 |     GENERATION_DIR
 26 | 
 27 | warnings.filterwarnings('ignore')
 28 | 
 29 | 
 30 | def get_fault_code_list(x):
 31 |     try:
 32 |         x = x.replace('.', ',').split(',')
 33 |     except:
 34 |         x = []
 35 |     return x
 36 | 
 37 | 
 38 | def get_module_cause_list(x):
 39 |     try:
 40 |         x = x.replace(',', '_').replace('，', '_')
 41 |         x = list(set(x.split('_')))
 42 |     except:
 43 |         x = []
 44 |     return x
 45 | 
 46 | 
 47 | def get_label(PSEUDO_FALG):
 48 |     preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
 49 |     preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
 50 | 
 51 |     if PSEUDO_FALG:
 52 |         print('获取伪标签LABEL')
 53 |         pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
 54 |         label = pd.concat([preliminary_train_label_dataset,
 55 |                            pseudo_labels,
 56 |                            preliminary_train_label_dataset_s],
 57 |                           ignore_index=True,
 58 |                           axis=0).sort_values(
 59 |             ['sn', 'fault_time']).reset_index(drop=True)
 60 |     else:
 61 |         print('不使用伪标签数据')
 62 |         label = pd.concat([preliminary_train_label_dataset,
 63 |                            preliminary_train_label_dataset_s],
 64 |                           ignore_index=True,
 65 |                           axis=0).sort_values(
 66 |             ['sn', 'fault_time']).reset_index(drop=True)
 67 |     label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
 68 |     label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
 69 |     return label
 70 | 
 71 | 
 72 | def get_module_cause_code(x, code_name):
 73 |     code_list = []
 74 |     for i in x:
 75 |         if code_name in i:
 76 |             code_list.append(i)
 77 |     return code_list
 78 | 
 79 | 
 80 | def get_alertname_code(x, alertname):
 81 |     x = x.split(',')
 82 | 
 83 |     try:
 84 |         alertname_code = x[x.index(alertname) + 1]
 85 |     except:
 86 |         alertname_code = np.nan
 87 |     return alertname_code
 88 | 
 89 | 
 90 | def get_alertname_code_2(x, alertname):
 91 |     # x =x.split(',')
 92 | 
 93 |     try:
 94 |         alertname_code = x[x.index(alertname) + 1]
 95 |     except:
 96 |         alertname_code = ' '
 97 |     return alertname_code
 98 | 
 99 | 
100 | def get_last_msg_cnt(x):
101 |     last_msg = x[-1]
102 |     cnt = x.count(last_msg)
103 |     return cnt
104 | 
105 | 
106 | def get_first_msg_cnt(x):
107 |     first_msg = x[0]
108 |     cnt = x.count(first_msg)
109 |     return cnt
110 | 
111 | 
112 | def get_crashdump_venus_data():
113 |     final_venus_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_venus_dataset_b.csv'))
114 |     final_crashdump_dataset = pd.read_csv(os.path.join(TEST_A_DIR, 'final_crashdump_dataset_b.csv'))
115 |     final_crashdump_venus = final_crashdump_dataset.merge(final_venus_dataset, on=['sn', 'fault_time'],
116 |                                                           how='outer')
117 | 
118 |     preliminary_venus_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_venus_dataset.csv'))
119 |     preliminary_crashdump_dataset = pd.read_csv(os.path.join(TRAIN_DIR, 'preliminary_crashdump_dataset.csv'))
120 |     preliminary_crashdump_venus = preliminary_crashdump_dataset.merge(preliminary_venus_dataset,
121 |                                                                       on=['sn', 'fault_time'],
122 |                                                                       how='outer')
123 | 
124 |     crashdump_venus = pd.concat([final_crashdump_venus, preliminary_crashdump_venus],
125 |                                 ignore_index=True).drop_duplicates()
126 |     crashdump_venus = crashdump_venus.sort_values(['sn', 'fault_time']).reset_index(drop=True)
127 |     return crashdump_venus
128 | 
129 | 
130 | def get_crashdump_venus_fea(crashdump_venus):
131 |     print('生成 crashdump_venus 特征')
132 |     crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].apply(lambda x: get_module_cause_list(x))
133 |     crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].apply(lambda x: get_fault_code_list(x))
134 | 
135 |     code_name_list = ['module', 'cod1', 'cod2', 'addr', 'port']
136 |     for code_name in code_name_list:
137 |         crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus['module_cause_list'].apply(
138 |             lambda x: get_module_cause_code(x, code_name))
139 |         crashdump_venus[f'module_cause_{code_name}_len'] = crashdump_venus[f'module_cause_{code_name}'].apply(
140 |             lambda x: len(x))
141 |         crashdump_venus[f'module_cause_{code_name}'] = crashdump_venus[f'module_cause_{code_name}'].apply(
142 |             lambda x: '_'.join(set(x)))
143 |     code_name_list = ['cha', '0x', 'cod', 'core', 'cpu', 'm2m', 'pcu']
144 |     for code_name in code_name_list:
145 |         crashdump_venus[f'fault_{code_name}'] = crashdump_venus['fault_code_list'].apply(
146 |             lambda x: get_module_cause_code(x, code_name))
147 |         crashdump_venus[f'fault_{code_name}_len'] = crashdump_venus[f'fault_{code_name}'].apply(lambda x: len(x))
148 |         crashdump_venus[f'fault_{code_name}'] = crashdump_venus[f'fault_{code_name}'].apply(lambda x: '_'.join(set(x)))
149 | 
150 |     cols_tmp = ['module_cause', 'fault_code', 'module_cause_module',
151 |                 'module_cause_cod1', 'module_cause_cod2', 'module_cause_addr',
152 |                 'module_cause_port', 'fault_cha', 'fault_0x', 'fault_cod', 'fault_core',
153 |                 'fault_cpu', 'fault_m2m', 'fault_pcu', ]
154 |     new_cat_cols = []
155 |     crashdump_venus = cat2num(crashdump_venus, cols_tmp)
156 |     for name in cols_tmp:
157 |         # le = LabelEncoder()
158 |         # crashdump_venus[f'{name}_LabelEnc'] = le.fit_transform(crashdump_venus[name])
159 |         new_cat_cols.append(f'{name}_LabelEnc')
160 | 
161 |     num_cols = ['fault_pcu_len', 'fault_m2m_len',
162 |                 'fault_cpu_len', 'fault_0x_len', 'fault_cod_len',
163 |                 'module_cause_module_len', 'module_cause_cod1_len',
164 |                 'module_cause_cod2_len', 'module_cause_addr_len',
165 |                 'module_cause_port_len', 'fault_cha_len', 'fault_core_len', ]
166 | 
167 |     crashdump_venus = crashdump_venus[['sn', 'fault_time'] + new_cat_cols + num_cols]
168 |     crashdump_venus = crashdump_venus.rename(columns={'fault_time': 'crashdump_fault_time'})
169 | 
170 |     crashdump_venus['crashdump_fault_time'] = pd.to_datetime(crashdump_venus['crashdump_fault_time'])
171 |     del crashdump_venus['crashdump_fault_time']
172 |     print(f'生成 crashdump_venus 特征完毕,特征维度 {crashdump_venus.shape}')
173 |     return crashdump_venus
174 | 
175 | 
176 | def get_location_word(x, num):
177 |     try:
178 |         return x[num]
179 |     except:
180 |         return
181 | 
182 | 
183 | def get_label(PSEUDO_FALG):
184 |     preliminary_train_label_dataset = pd.read_csv(preliminary_train_label_dataset_path)
185 |     preliminary_train_label_dataset_s = pd.read_csv(preliminary_train_label_dataset_s_path)
186 | 
187 |     if PSEUDO_FALG:
188 |         print('获取伪标签LABEL')
189 |         pseudo_labels = pd.read_csv(os.path.join(TRAIN_DIR, 'pseudo_labels.csv'))
190 |         label = pd.concat([preliminary_train_label_dataset,
191 |                            pseudo_labels,
192 |                            preliminary_train_label_dataset_s],
193 |                           ignore_index=True,
194 |                           axis=0).sort_values(
195 |             ['sn', 'fault_time']).reset_index(drop=True)
196 |     else:
197 |         print('不使用伪标签数据')
198 |         label = pd.concat([preliminary_train_label_dataset,
199 |                            preliminary_train_label_dataset_s],
200 |                           ignore_index=True,
201 |                           axis=0).sort_values(
202 |             ['sn', 'fault_time']).reset_index(drop=True)
203 |     label['fault_time'] = label['fault_time'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
204 |     label['fault_time'] = label['fault_time'].apply(lambda x: str(x))
205 |     return label
206 | 
207 | 
208 | module_list = ['module0','module1','module2','module3','module4','module5','module7','module8','module9',
209 |              'module10','module11','module12','module13','module14','module17','module18','module19',
210 |              'in traffic control',
211 |              'irpp0','irpp1',
212 |              'pcie rootport 0:0.0','pcie rootport a2:0.0','pcie rootport 2b:3.0',
213 |              'port a','port c']
214 | module_list2 = ['module0','module1','module2','module3','module4','module5','module7','module8','module9',
215 | 'module10','module11','module12','module13','module14','module17','module18','module19']
216 | other_module_list = ['in traffic control', 'irpp0', 'irpp1', 'pcie rootport 0:0.0',
217 |        'pcie rootport a2:0.0', 'pcie rootport 2b:3.0', 'port a', 'port c']
218 | module_content_list = ['module0_cod1', 'module0_cod2', 'module0_addr',
219 |        'module1_cod1', 'module1_cod2', 'module1_addr', 'module2_cod1',
220 |        'module2_cod2', 'module2_addr', 'module3_cod1', 'module3_cod2',
221 |        'module3_addr', 'module4_cod1', 'module4_cod2', 'module4_addr',
222 |        'module5_cod1', 'module5_cod2', 'module5_addr', 'module7_cod1',
223 |        'module7_cod2', 'module7_addr', 'module8_cod1', 'module8_cod2',
224 |        'module8_addr', 'module9_cod1', 'module9_cod2', 'module9_addr',
225 |        'module10_cod1', 'module10_cod2', 'module10_addr', 'module11_cod1',
226 |        'module11_cod2', 'module11_addr', 'module12_cod1', 'module12_cod2',
227 |        'module12_addr', 'module13_cod1', 'module13_cod2', 'module13_addr',
228 |        'module14_cod1', 'module14_cod2', 'module14_addr', 'module17_cod1',
229 |        'module17_cod2', 'module17_addr', 'module18_cod1', 'module18_cod2',
230 |        'module18_addr', 'module19_cod1', 'module19_cod2', 'module19_addr']
231 | fault_code_content_list = ['fault_code_cod1', 'fault_code_cod2',
232 |        'fault_code_cpu0', 'fault_code_cpu1']
233 | 
234 | 
235 | crashdump_venus = get_crashdump_venus_data()
236 | crashdump_venus['module_cause_list'] = crashdump_venus['module_cause'].fillna('_').apply(lambda x:x.split(','))
237 | crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(lambda x:x.replace(':','_').replace(',','_'))
238 | for module in module_list:
239 |     crashdump_venus['module_cause'] = crashdump_venus['module_cause'].fillna('_').apply(
240 |     lambda x:x.replace(f'{module}_',f'{module}:').replace(f'_{module}',f',{module}'))
241 | crashdump_venus['module_cause'] = crashdump_venus['module_cause'].apply(lambda x:x.replace(':',','))
242 | 
243 | for module in module_list:
244 |     crashdump_venus[module] = crashdump_venus['module_cause'].apply(lambda x:get_alertname_code(x,module))
245 |     crashdump_venus[module] = crashdump_venus.loc[:,module].fillna(' ').apply(lambda x:x.replace('_',' '))
246 |     crashdump_venus[module] = crashdump_venus[module].apply(lambda x:x.split(' '))
247 | crashdump_venus['module_cause_new'] = crashdump_venus.loc[:,module_list].sum(1)
248 | 
249 | 
250 | for module in module_list2:
251 |     crashdump_venus[f'{module}_cod1'] = crashdump_venus[module].apply(lambda x:[get_alertname_code_2(x,'cod1')])
252 |     crashdump_venus[f'{module}_cod2'] = crashdump_venus[module].apply(lambda x:[get_alertname_code_2(x,'cod2')])
253 |     crashdump_venus[f'{module}_addr'] = crashdump_venus[module].apply(lambda x:[get_alertname_code_2(x,'addr')])
254 |     del crashdump_venus[module]
255 |     gc.collect()
256 | 
257 | crashdump_venus['fault_code_list'] = crashdump_venus['fault_code'].fillna(' ').apply(lambda x:x.split('.'))
258 | for i in ['cod1','cod2','cpu0','cpu1']:
259 |     crashdump_venus[f'fault_code_{i}'] = crashdump_venus['fault_code_list'].apply(lambda x:[get_alertname_code_2(x,i)])
260 | 
261 | 
262 | crashdump_venus['other_module_list'] = crashdump_venus.loc[:,other_module_list].sum(1)
263 | crashdump_venus['module_content_list'] = crashdump_venus.loc[:,module_content_list].sum(1)
264 | crashdump_venus['module_cause_new'] = crashdump_venus.loc[:,other_module_list+module_content_list].sum(1)
265 | crashdump_venus['fault_code_content_list'] = crashdump_venus.loc[:,fault_code_content_list].sum(1)
266 | crashdump_venus['all_crashdump_venus'] = crashdump_venus.loc[:,other_module_list+module_content_list+fault_code_content_list].sum(1)
267 | 
268 | f1_list = ['sn']
269 | f2_list = ['other_module_list','module_content_list','module_cause_new','fault_code_content_list','all_crashdump_venus']
270 | w2v_feats_df = crashdump_venus[f1_list].drop_duplicates()
271 | w2v_feats_df_list = []
272 | for f1 in f1_list:
273 |     for f2 in f2_list:
274 |         w2v_fea_tmp = add_w2v_feats(crashdump_venus,w2v_feats_df,f1,f2,emb_size = 10,window = 5,min_count  =5,)
275 |         w2v_feats_df_list.append(w2v_fea_tmp)
276 | w2v_feats_df = w2v_feats_df_list[0]
277 | for i in w2v_feats_df_list[1:]:
278 |     w2v_feats_df = w2v_feats_df.merge(i,on = 'sn',how = 'left')
279 | 
280 | for i in other_module_list+module_content_list+fault_code_content_list:
281 |     crashdump_venus[i] = crashdump_venus[i].astype(str)
282 | 
283 | crashdump_venus = cat2num(crashdump_venus,other_module_list+module_content_list+fault_code_content_list)
284 | for i in other_module_list+module_content_list+fault_code_content_list:
285 |     del crashdump_venus[i]
286 | gc.collect()
287 | crashdump_venus = crashdump_venus.merge(w2v_feats_df,on ='sn',how ='left').rename(columns ={'fault_time':'crashdump_venus_fault_time'} )
288 | 
289 | preliminary_train_label_dataset_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset.csv')
290 | preliminary_train_label_dataset_s_path = os.path.join(TRAIN_DIR, 'preliminary_train_label_dataset_s.csv')
291 | test = pd.read_csv(os.path.join(TEST_A_DIR, 'final_submit_dataset_b.csv'))[['sn', 'fault_time' ]]
292 | train = get_label(False)[['sn', 'fault_time', 'label',]]
293 | 
294 | test_tmp = test[['sn', 'fault_time']]
295 | test_tmp = test_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)
296 | train_tmp = train[['sn', 'fault_time', 'label', ]]
297 | train_tmp = train_tmp.merge(crashdump_venus, on='sn').drop_duplicates(['sn', 'fault_time']).reset_index(drop=True)
298 | 
299 | 
300 | train_tmp['duration_fault_time'] = pd.to_datetime(train_tmp['fault_time']) - pd.to_datetime(train_tmp['crashdump_venus_fault_time'])
301 | test_tmp['duration_fault_time'] = pd.to_datetime(test_tmp['fault_time']) - pd.to_datetime(test_tmp['crashdump_venus_fault_time'])
302 | 
303 | train_tmp['duration_fault_time'] = train_tmp['duration_fault_time'].apply(lambda x:x.total_seconds())
304 | test_tmp['duration_fault_time']  = test_tmp['duration_fault_time'].apply(lambda x:x.total_seconds())
305 | 
306 | 
307 | drop_cols = ['sn', 'fault_time', 'fault_code', 'module_cause', 'module','crashdump_venus_fault_time',
308 |        'module_cause_list', 'module_cause_new', 'fault_code_list','label','duration_fault_time',
309 |        'other_module_list', 'module_content_list', 'fault_code_content_list',
310 |        'all_crashdump_venus',]
311 | use_cols = [i for i in train_tmp.columns if i not in drop_cols]
312 | 
313 | cat_cols = [f'{i}_LabelEnc' for i in other_module_list+module_content_list+fault_code_content_list]
314 | 
315 | oof_prob = np.zeros((train.shape[0], 4))
316 | 
317 | test_prob = np.zeros((test.shape[0], 4))
318 | # seeds = [42,4242,40424,1024,2048]
319 | seeds = [42 ]
320 | for seed in seeds:
321 |     oof_prob, test_prob, fea_imp_df, model_list = run_cbt(train_tmp[use_cols], train_tmp[['label']], test_tmp[use_cols], k=5,
322 |                                               seed=seed, cat_cols=cat_cols)
323 |     oof_prob +=oof_prob/len(seeds)
324 |     test_prob +=test_prob/len(seeds)
325 | 
326 | 
327 | weight = search_weight(train_tmp, train_tmp[['label']], oof_prob, init_weight=[1.0], class_num=4, step=0.001)
328 | oof_prob = oof_prob * np.array(weight)
329 | test_prob = test_prob * np.array(weight)
330 | 
331 | 
332 | target_df = train_tmp[['sn', 'fault_time', 'label']].drop_duplicates(['sn', 'fault_time'])
333 | submit_df = train_tmp[['sn', 'fault_time']]
334 | submit_df['label'] = oof_prob.argmax(axis=1)
335 | submit_df = submit_df.drop_duplicates(['sn', 'fault_time'])
336 | # submit_df = pd.read_csv(os.path.join(GENERATION_DIR,'crashdump_venus_fea1.csv')).rename(columns = {'crashdump_venus_label':'label'})
337 | 
338 | 
339 | score = macro_f1(target_df=target_df, submit_df=submit_df)
340 | print(f'********************** BEST MACRO_F1 : {score} **********************')
341 | score = round(score, 5)
342 | 
343 | print(fea_imp_df[:20])
344 | y_pred = test_prob.argmax(axis=1)
345 | result = test_tmp[['sn', 'fault_time']]
346 | result['label'] = y_pred
347 | result = result.drop_duplicates(['sn', 'fault_time'])
348 | 
349 | crashdump_venus_fea = pd.concat([submit_df,result],ignore_index = False,axis = 0)
350 | crashdump_venus_fea = crashdump_venus_fea.rename(columns = {'label':'crashdump_venus_label_v1'})
351 | crashdump_venus_fea.to_csv(os.path.join(GENERATION_DIR,'crashdump_venus_fea_v1.csv'),index= False)
352 | print(crashdump_venus_fea['crashdump_venus_label_v1'].value_counts())
353 | 
354 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU AFFERO GENERAL PUBLIC LICENSE
  2 |                        Version 3, 19 November 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU Affero General Public License is a free, copyleft license for
 11 | software and other kinds of works, specifically designed to ensure
 12 | cooperation with the community in the case of network server software.
 13 | 
 14 |   The licenses for most software and other practical works are designed
 15 | to take away your freedom to share and change the works.  By contrast,
 16 | our General Public Licenses are intended to guarantee your freedom to
 17 | share and change all versions of a program--to make sure it remains free
 18 | software for all its users.
 19 | 
 20 |   When we speak of free software, we are referring to freedom, not
 21 | price.  Our General Public Licenses are designed to make sure that you
 22 | have the freedom to distribute copies of free software (and charge for
 23 | them if you wish), that you receive source code or can get it if you
 24 | want it, that you can change the software or use pieces of it in new
 25 | free programs, and that you know you can do these things.
 26 | 
 27 |   Developers that use our General Public Licenses protect your rights
 28 | with two steps: (1) assert copyright on the software, and (2) offer
 29 | you this License which gives you legal permission to copy, distribute
 30 | and/or modify the software.
 31 | 
 32 |   A secondary benefit of defending all users' freedom is that
 33 | improvements made in alternate versions of the program, if they
 34 | receive widespread use, become available for other developers to
 35 | incorporate.  Many developers of free software are heartened and
 36 | encouraged by the resulting cooperation.  However, in the case of
 37 | software used on network servers, this result may fail to come about.
 38 | The GNU General Public License permits making a modified version and
 39 | letting the public access it on a server without ever releasing its
 40 | source code to the public.
 41 | 
 42 |   The GNU Affero General Public License is designed specifically to
 43 | ensure that, in such cases, the modified source code becomes available
 44 | to the community.  It requires the operator of a network server to
 45 | provide the source code of the modified version running there to the
 46 | users of that server.  Therefore, public use of a modified version, on
 47 | a publicly accessible server, gives the public access to the source
 48 | code of the modified version.
 49 | 
 50 |   An older license, called the Affero General Public License and
 51 | published by Affero, was designed to accomplish similar goals.  This is
 52 | a different license, not a version of the Affero GPL, but Affero has
 53 | released a new version of the Affero GPL which permits relicensing under
 54 | this license.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                        TERMS AND CONDITIONS
 60 | 
 61 |   0. Definitions.
 62 | 
 63 |   "This License" refers to version 3 of the GNU Affero General Public License.
 64 | 
 65 |   "Copyright" also means copyright-like laws that apply to other kinds of
 66 | works, such as semiconductor masks.
 67 | 
 68 |   "The Program" refers to any copyrightable work licensed under this
 69 | License.  Each licensee is addressed as "you".  "Licensees" and
 70 | "recipients" may be individuals or organizations.
 71 | 
 72 |   To "modify" a work means to copy from or adapt all or part of the work
 73 | in a fashion requiring copyright permission, other than the making of an
 74 | exact copy.  The resulting work is called a "modified version" of the
 75 | earlier work or a work "based on" the earlier work.
 76 | 
 77 |   A "covered work" means either the unmodified Program or a work based
 78 | on the Program.
 79 | 
 80 |   To "propagate" a work means to do anything with it that, without
 81 | permission, would make you directly or secondarily liable for
 82 | infringement under applicable copyright law, except executing it on a
 83 | computer or modifying a private copy.  Propagation includes copying,
 84 | distribution (with or without modification), making available to the
 85 | public, and in some countries other activities as well.
 86 | 
 87 |   To "convey" a work means any kind of propagation that enables other
 88 | parties to make or receive copies.  Mere interaction with a user through
 89 | a computer network, with no transfer of a copy, is not conveying.
 90 | 
 91 |   An interactive user interface displays "Appropriate Legal Notices"
 92 | to the extent that it includes a convenient and prominently visible
 93 | feature that (1) displays an appropriate copyright notice, and (2)
 94 | tells the user that there is no warranty for the work (except to the
 95 | extent that warranties are provided), that licensees may convey the
 96 | work under this License, and how to view a copy of this License.  If
 97 | the interface presents a list of user commands or options, such as a
 98 | menu, a prominent item in the list meets this criterion.
 99 | 
100 |   1. Source Code.
101 | 
102 |   The "source code" for a work means the preferred form of the work
103 | for making modifications to it.  "Object code" means any non-source
104 | form of a work.
105 | 
106 |   A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 | 
111 |   The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form.  A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 | 
122 |   The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities.  However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work.  For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 | 
135 |   The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 | 
139 |   The Corresponding Source for a work in source code form is that
140 | same work.
141 | 
142 |   2. Basic Permissions.
143 | 
144 |   All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met.  This License explicitly affirms your unlimited
147 | permission to run the unmodified Program.  The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work.  This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 | 
152 |   You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force.  You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright.  Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 | 
163 |   Conveying under any other circumstances is permitted solely under
164 | the conditions stated below.  Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 | 
167 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 | 
169 |   No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 | 
175 |   When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 | 
183 |   4. Conveying Verbatim Copies.
184 | 
185 |   You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 | 
193 |   You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 | 
196 |   5. Conveying Modified Source Versions.
197 | 
198 |   You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 | 
202 |     a) The work must carry prominent notices stating that you modified
203 |     it, and giving a relevant date.
204 | 
205 |     b) The work must carry prominent notices stating that it is
206 |     released under this License and any conditions added under section
207 |     7.  This requirement modifies the requirement in section 4 to
208 |     "keep intact all notices".
209 | 
210 |     c) You must license the entire work, as a whole, under this
211 |     License to anyone who comes into possession of a copy.  This
212 |     License will therefore apply, along with any applicable section 7
213 |     additional terms, to the whole of the work, and all its parts,
214 |     regardless of how they are packaged.  This License gives no
215 |     permission to license the work in any other way, but it does not
216 |     invalidate such permission if you have separately received it.
217 | 
218 |     d) If the work has interactive user interfaces, each must display
219 |     Appropriate Legal Notices; however, if the Program has interactive
220 |     interfaces that do not display Appropriate Legal Notices, your
221 |     work need not make them do so.
222 | 
223 |   A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit.  Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 | 
233 |   6. Conveying Non-Source Forms.
234 | 
235 |   You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 | 
240 |     a) Convey the object code in, or embodied in, a physical product
241 |     (including a physical distribution medium), accompanied by the
242 |     Corresponding Source fixed on a durable physical medium
243 |     customarily used for software interchange.
244 | 
245 |     b) Convey the object code in, or embodied in, a physical product
246 |     (including a physical distribution medium), accompanied by a
247 |     written offer, valid for at least three years and valid for as
248 |     long as you offer spare parts or customer support for that product
249 |     model, to give anyone who possesses the object code either (1) a
250 |     copy of the Corresponding Source for all the software in the
251 |     product that is covered by this License, on a durable physical
252 |     medium customarily used for software interchange, for a price no
253 |     more than your reasonable cost of physically performing this
254 |     conveying of source, or (2) access to copy the
255 |     Corresponding Source from a network server at no charge.
256 | 
257 |     c) Convey individual copies of the object code with a copy of the
258 |     written offer to provide the Corresponding Source.  This
259 |     alternative is allowed only occasionally and noncommercially, and
260 |     only if you received the object code with such an offer, in accord
261 |     with subsection 6b.
262 | 
263 |     d) Convey the object code by offering access from a designated
264 |     place (gratis or for a charge), and offer equivalent access to the
265 |     Corresponding Source in the same way through the same place at no
266 |     further charge.  You need not require recipients to copy the
267 |     Corresponding Source along with the object code.  If the place to
268 |     copy the object code is a network server, the Corresponding Source
269 |     may be on a different server (operated by you or a third party)
270 |     that supports equivalent copying facilities, provided you maintain
271 |     clear directions next to the object code saying where to find the
272 |     Corresponding Source.  Regardless of what server hosts the
273 |     Corresponding Source, you remain obligated to ensure that it is
274 |     available for as long as needed to satisfy these requirements.
275 | 
276 |     e) Convey the object code using peer-to-peer transmission, provided
277 |     you inform other peers where the object code and Corresponding
278 |     Source of the work are being offered to the general public at no
279 |     charge under subsection 6d.
280 | 
281 |   A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 | 
285 |   A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling.  In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage.  For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product.  A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 | 
298 |   "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source.  The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 | 
306 |   If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information.  But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 | 
317 |   The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed.  Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 | 
325 |   Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 | 
331 |   7. Additional Terms.
332 | 
333 |   "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law.  If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 | 
342 |   When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it.  (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.)  You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 | 
349 |   Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 | 
353 |     a) Disclaiming warranty or limiting liability differently from the
354 |     terms of sections 15 and 16 of this License; or
355 | 
356 |     b) Requiring preservation of specified reasonable legal notices or
357 |     author attributions in that material or in the Appropriate Legal
358 |     Notices displayed by works containing it; or
359 | 
360 |     c) Prohibiting misrepresentation of the origin of that material, or
361 |     requiring that modified versions of such material be marked in
362 |     reasonable ways as different from the original version; or
363 | 
364 |     d) Limiting the use for publicity purposes of names of licensors or
365 |     authors of the material; or
366 | 
367 |     e) Declining to grant rights under trademark law for use of some
368 |     trade names, trademarks, or service marks; or
369 | 
370 |     f) Requiring indemnification of licensors and authors of that
371 |     material by anyone who conveys the material (or modified versions of
372 |     it) with contractual assumptions of liability to the recipient, for
373 |     any liability that these contractual assumptions directly impose on
374 |     those licensors and authors.
375 | 
376 |   All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10.  If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term.  If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 | 
386 |   If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 | 
391 |   Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 | 
395 |   8. Termination.
396 | 
397 |   You may not propagate or modify a covered work except as expressly
398 | provided under this License.  Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 | 
403 |   However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 | 
410 |   Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 | 
417 |   Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License.  If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 | 
423 |   9. Acceptance Not Required for Having Copies.
424 | 
425 |   You are not required to accept this License in order to receive or
426 | run a copy of the Program.  Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance.  However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work.  These actions infringe copyright if you do
431 | not accept this License.  Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 | 
434 |   10. Automatic Licensing of Downstream Recipients.
435 | 
436 |   Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License.  You are not responsible
439 | for enforcing compliance by third parties with this License.
440 | 
441 |   An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations.  If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 | 
451 |   You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License.  For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 | 
459 |   11. Patents.
460 | 
461 |   A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based.  The
463 | work thus licensed is called the contributor's "contributor version".
464 | 
465 |   A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version.  For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 | 
475 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 | 
480 |   In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement).  To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 | 
487 |   If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients.  "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 | 
501 |   If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 | 
509 |   A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License.  You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 | 
524 |   Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 | 
528 |   12. No Surrender of Others' Freedom.
529 | 
530 |   If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License.  If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all.  For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 | 
540 |   13. Remote Network Interaction; Use with the GNU General Public License.
541 | 
542 |   Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software.  This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 | 
553 |   Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work.  The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 | 
561 |   14. Revised Versions of this License.
562 | 
563 |   The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time.  Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 | 
568 |   Each version is given a distinguishing version number.  If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation.  If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 | 
577 |   If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 | 
582 |   Later license versions may give you additional or different
583 | permissions.  However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 | 
587 |   15. Disclaimer of Warranty.
588 | 
589 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 | 
598 |   16. Limitation of Liability.
599 | 
600 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 | 
610 |   17. Interpretation of Sections 15 and 16.
611 | 
612 |   If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 | 
619 |                      END OF TERMS AND CONDITIONS
620 | 
621 |             How to Apply These Terms to Your New Programs
622 | 
623 |   If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 | 
627 |   To do so, attach the following notices to the program.  It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 | 
632 |     <one line to give the program's name and a brief idea of what it does.>
633 |     Copyright (C) <year>  <name of author>
634 | 
635 |     This program is free software: you can redistribute it and/or modify
636 |     it under the terms of the GNU Affero General Public License as published
637 |     by the Free Software Foundation, either version 3 of the License, or
638 |     (at your option) any later version.
639 | 
640 |     This program is distributed in the hope that it will be useful,
641 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
642 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
643 |     GNU Affero General Public License for more details.
644 | 
645 |     You should have received a copy of the GNU Affero General Public License
646 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
647 | 
648 | Also add information on how to contact you by electronic and paper mail.
649 | 
650 |   If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source.  For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code.  There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 | 
658 |   You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <https://www.gnu.org/licenses/>.
662 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU AFFERO GENERAL PUBLIC LICENSE
  2 |                        Version 3, 19 November 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU Affero General Public License is a free, copyleft license for
 11 | software and other kinds of works, specifically designed to ensure
 12 | cooperation with the community in the case of network server software.
 13 | 
 14 |   The licenses for most software and other practical works are designed
 15 | to take away your freedom to share and change the works.  By contrast,
 16 | our General Public Licenses are intended to guarantee your freedom to
 17 | share and change all versions of a program--to make sure it remains free
 18 | software for all its users.
 19 | 
 20 |   When we speak of free software, we are referring to freedom, not
 21 | price.  Our General Public Licenses are designed to make sure that you
 22 | have the freedom to distribute copies of free software (and charge for
 23 | them if you wish), that you receive source code or can get it if you
 24 | want it, that you can change the software or use pieces of it in new
 25 | free programs, and that you know you can do these things.
 26 | 
 27 |   Developers that use our General Public Licenses protect your rights
 28 | with two steps: (1) assert copyright on the software, and (2) offer
 29 | you this License which gives you legal permission to copy, distribute
 30 | and/or modify the software.
 31 | 
 32 |   A secondary benefit of defending all users' freedom is that
 33 | improvements made in alternate versions of the program, if they
 34 | receive widespread use, become available for other developers to
 35 | incorporate.  Many developers of free software are heartened and
 36 | encouraged by the resulting cooperation.  However, in the case of
 37 | software used on network servers, this result may fail to come about.
 38 | The GNU General Public License permits making a modified version and
 39 | letting the public access it on a server without ever releasing its
 40 | source code to the public.
 41 | 
 42 |   The GNU Affero General Public License is designed specifically to
 43 | ensure that, in such cases, the modified source code becomes available
 44 | to the community.  It requires the operator of a network server to
 45 | provide the source code of the modified version running there to the
 46 | users of that server.  Therefore, public use of a modified version, on
 47 | a publicly accessible server, gives the public access to the source
 48 | code of the modified version.
 49 | 
 50 |   An older license, called the Affero General Public License and
 51 | published by Affero, was designed to accomplish similar goals.  This is
 52 | a different license, not a version of the Affero GPL, but Affero has
 53 | released a new version of the Affero GPL which permits relicensing under
 54 | this license.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                        TERMS AND CONDITIONS
 60 | 
 61 |   0. Definitions.
 62 | 
 63 |   "This License" refers to version 3 of the GNU Affero General Public License.
 64 | 
 65 |   "Copyright" also means copyright-like laws that apply to other kinds of
 66 | works, such as semiconductor masks.
 67 | 
 68 |   "The Program" refers to any copyrightable work licensed under this
 69 | License.  Each licensee is addressed as "you".  "Licensees" and
 70 | "recipients" may be individuals or organizations.
 71 | 
 72 |   To "modify" a work means to copy from or adapt all or part of the work
 73 | in a fashion requiring copyright permission, other than the making of an
 74 | exact copy.  The resulting work is called a "modified version" of the
 75 | earlier work or a work "based on" the earlier work.
 76 | 
 77 |   A "covered work" means either the unmodified Program or a work based
 78 | on the Program.
 79 | 
 80 |   To "propagate" a work means to do anything with it that, without
 81 | permission, would make you directly or secondarily liable for
 82 | infringement under applicable copyright law, except executing it on a
 83 | computer or modifying a private copy.  Propagation includes copying,
 84 | distribution (with or without modification), making available to the
 85 | public, and in some countries other activities as well.
 86 | 
 87 |   To "convey" a work means any kind of propagation that enables other
 88 | parties to make or receive copies.  Mere interaction with a user through
 89 | a computer network, with no transfer of a copy, is not conveying.
 90 | 
 91 |   An interactive user interface displays "Appropriate Legal Notices"
 92 | to the extent that it includes a convenient and prominently visible
 93 | feature that (1) displays an appropriate copyright notice, and (2)
 94 | tells the user that there is no warranty for the work (except to the
 95 | extent that warranties are provided), that licensees may convey the
 96 | work under this License, and how to view a copy of this License.  If
 97 | the interface presents a list of user commands or options, such as a
 98 | menu, a prominent item in the list meets this criterion.
 99 | 
100 |   1. Source Code.
101 | 
102 |   The "source code" for a work means the preferred form of the work
103 | for making modifications to it.  "Object code" means any non-source
104 | form of a work.
105 | 
106 |   A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 | 
111 |   The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form.  A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 | 
122 |   The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities.  However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work.  For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 | 
135 |   The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 | 
139 |   The Corresponding Source for a work in source code form is that
140 | same work.
141 | 
142 |   2. Basic Permissions.
143 | 
144 |   All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met.  This License explicitly affirms your unlimited
147 | permission to run the unmodified Program.  The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work.  This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 | 
152 |   You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force.  You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright.  Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 | 
163 |   Conveying under any other circumstances is permitted solely under
164 | the conditions stated below.  Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 | 
167 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 | 
169 |   No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 | 
175 |   When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 | 
183 |   4. Conveying Verbatim Copies.
184 | 
185 |   You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 | 
193 |   You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 | 
196 |   5. Conveying Modified Source Versions.
197 | 
198 |   You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 | 
202 |     a) The work must carry prominent notices stating that you modified
203 |     it, and giving a relevant date.
204 | 
205 |     b) The work must carry prominent notices stating that it is
206 |     released under this License and any conditions added under section
207 |     7.  This requirement modifies the requirement in section 4 to
208 |     "keep intact all notices".
209 | 
210 |     c) You must license the entire work, as a whole, under this
211 |     License to anyone who comes into possession of a copy.  This
212 |     License will therefore apply, along with any applicable section 7
213 |     additional terms, to the whole of the work, and all its parts,
214 |     regardless of how they are packaged.  This License gives no
215 |     permission to license the work in any other way, but it does not
216 |     invalidate such permission if you have separately received it.
217 | 
218 |     d) If the work has interactive user interfaces, each must display
219 |     Appropriate Legal Notices; however, if the Program has interactive
220 |     interfaces that do not display Appropriate Legal Notices, your
221 |     work need not make them do so.
222 | 
223 |   A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit.  Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 | 
233 |   6. Conveying Non-Source Forms.
234 | 
235 |   You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 | 
240 |     a) Convey the object code in, or embodied in, a physical product
241 |     (including a physical distribution medium), accompanied by the
242 |     Corresponding Source fixed on a durable physical medium
243 |     customarily used for software interchange.
244 | 
245 |     b) Convey the object code in, or embodied in, a physical product
246 |     (including a physical distribution medium), accompanied by a
247 |     written offer, valid for at least three years and valid for as
248 |     long as you offer spare parts or customer support for that product
249 |     model, to give anyone who possesses the object code either (1) a
250 |     copy of the Corresponding Source for all the software in the
251 |     product that is covered by this License, on a durable physical
252 |     medium customarily used for software interchange, for a price no
253 |     more than your reasonable cost of physically performing this
254 |     conveying of source, or (2) access to copy the
255 |     Corresponding Source from a network server at no charge.
256 | 
257 |     c) Convey individual copies of the object code with a copy of the
258 |     written offer to provide the Corresponding Source.  This
259 |     alternative is allowed only occasionally and noncommercially, and
260 |     only if you received the object code with such an offer, in accord
261 |     with subsection 6b.
262 | 
263 |     d) Convey the object code by offering access from a designated
264 |     place (gratis or for a charge), and offer equivalent access to the
265 |     Corresponding Source in the same way through the same place at no
266 |     further charge.  You need not require recipients to copy the
267 |     Corresponding Source along with the object code.  If the place to
268 |     copy the object code is a network server, the Corresponding Source
269 |     may be on a different server (operated by you or a third party)
270 |     that supports equivalent copying facilities, provided you maintain
271 |     clear directions next to the object code saying where to find the
272 |     Corresponding Source.  Regardless of what server hosts the
273 |     Corresponding Source, you remain obligated to ensure that it is
274 |     available for as long as needed to satisfy these requirements.
275 | 
276 |     e) Convey the object code using peer-to-peer transmission, provided
277 |     you inform other peers where the object code and Corresponding
278 |     Source of the work are being offered to the general public at no
279 |     charge under subsection 6d.
280 | 
281 |   A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 | 
285 |   A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling.  In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage.  For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product.  A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 | 
298 |   "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source.  The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 | 
306 |   If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information.  But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 | 
317 |   The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed.  Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 | 
325 |   Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 | 
331 |   7. Additional Terms.
332 | 
333 |   "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law.  If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 | 
342 |   When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it.  (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.)  You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 | 
349 |   Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 | 
353 |     a) Disclaiming warranty or limiting liability differently from the
354 |     terms of sections 15 and 16 of this License; or
355 | 
356 |     b) Requiring preservation of specified reasonable legal notices or
357 |     author attributions in that material or in the Appropriate Legal
358 |     Notices displayed by works containing it; or
359 | 
360 |     c) Prohibiting misrepresentation of the origin of that material, or
361 |     requiring that modified versions of such material be marked in
362 |     reasonable ways as different from the original version; or
363 | 
364 |     d) Limiting the use for publicity purposes of names of licensors or
365 |     authors of the material; or
366 | 
367 |     e) Declining to grant rights under trademark law for use of some
368 |     trade names, trademarks, or service marks; or
369 | 
370 |     f) Requiring indemnification of licensors and authors of that
371 |     material by anyone who conveys the material (or modified versions of
372 |     it) with contractual assumptions of liability to the recipient, for
373 |     any liability that these contractual assumptions directly impose on
374 |     those licensors and authors.
375 | 
376 |   All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10.  If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term.  If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 | 
386 |   If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 | 
391 |   Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 | 
395 |   8. Termination.
396 | 
397 |   You may not propagate or modify a covered work except as expressly
398 | provided under this License.  Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 | 
403 |   However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 | 
410 |   Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 | 
417 |   Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License.  If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 | 
423 |   9. Acceptance Not Required for Having Copies.
424 | 
425 |   You are not required to accept this License in order to receive or
426 | run a copy of the Program.  Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance.  However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work.  These actions infringe copyright if you do
431 | not accept this License.  Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 | 
434 |   10. Automatic Licensing of Downstream Recipients.
435 | 
436 |   Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License.  You are not responsible
439 | for enforcing compliance by third parties with this License.
440 | 
441 |   An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations.  If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 | 
451 |   You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License.  For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 | 
459 |   11. Patents.
460 | 
461 |   A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based.  The
463 | work thus licensed is called the contributor's "contributor version".
464 | 
465 |   A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version.  For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 | 
475 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 | 
480 |   In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement).  To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 | 
487 |   If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients.  "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 | 
501 |   If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 | 
509 |   A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License.  You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 | 
524 |   Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 | 
528 |   12. No Surrender of Others' Freedom.
529 | 
530 |   If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License.  If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all.  For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 | 
540 |   13. Remote Network Interaction; Use with the GNU General Public License.
541 | 
542 |   Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software.  This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 | 
553 |   Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work.  The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 | 
561 |   14. Revised Versions of this License.
562 | 
563 |   The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time.  Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 | 
568 |   Each version is given a distinguishing version number.  If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation.  If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 | 
577 |   If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 | 
582 |   Later license versions may give you additional or different
583 | permissions.  However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 | 
587 |   15. Disclaimer of Warranty.
588 | 
589 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 | 
598 |   16. Limitation of Liability.
599 | 
600 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 | 
610 |   17. Interpretation of Sections 15 and 16.
611 | 
612 |   If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 | 
619 |                      END OF TERMS AND CONDITIONS
620 | 
621 |             How to Apply These Terms to Your New Programs
622 | 
623 |   If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 | 
627 |   To do so, attach the following notices to the program.  It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 | 
632 |     <one line to give the program's name and a brief idea of what it does.>
633 |     Copyright (C) <year>  <name of author>
634 | 
635 |     This program is free software: you can redistribute it and/or modify
636 |     it under the terms of the GNU Affero General Public License as published
637 |     by the Free Software Foundation, either version 3 of the License, or
638 |     (at your option) any later version.
639 | 
640 |     This program is distributed in the hope that it will be useful,
641 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
642 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
643 |     GNU Affero General Public License for more details.
644 | 
645 |     You should have received a copy of the GNU Affero General Public License
646 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
647 | 
648 | Also add information on how to contact you by electronic and paper mail.
649 | 
650 |   If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source.  For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code.  There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 | 
658 |   You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <https://www.gnu.org/licenses/>.
662 | 


--------------------------------------------------------------------------------
/3rd_PanJiu_AIOps_Competition/code/generate_feature.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import os
  3 | import pickle
  4 | from collections import Counter
  5 | from utils import get_new_cols
  6 | import numpy as np
  7 | import pandas as pd
  8 | from tqdm import tqdm
  9 | from gensim.models import Word2Vec
 10 | from utils import GENERATION_DIR
 11 | from utils import KEY_1, KEY_2, KEY_3, KEY_4
 12 | from tqdm import tqdm
 13 | from scipy import stats
 14 | 
 15 | def cat2num(df, cat_cols, Transfer2num=True):
 16 |     '''
 17 | 
 18 |     :param df:
 19 |     :param cat_cols: 类别特征列表
 20 |     :param Transfer2num: 类别特征转换为数值特征
 21 |     :return:
 22 |     '''
 23 |     if Transfer2num:
 24 | 
 25 |         print('Transfer category feature to  num feature ')
 26 |         for col in cat_cols:
 27 | 
 28 |             if not os.path.exists(os.path.join(GENERATION_DIR, f'{col}_map.pkl')):
 29 |                 print(f'Transfer : {col}')
 30 |                 tmp_map = dict(zip(df[col].unique(), range(df[col].nunique())))
 31 |                 with open(os.path.join(GENERATION_DIR, f'{col}_map.pkl'), 'wb') as f:
 32 |                     pickle.dump(tmp_map, f)
 33 |             else:
 34 |                 with open(os.path.join(GENERATION_DIR, f'{col}_map.pkl'), 'rb') as f:
 35 |                     tmp_map = pickle.load(f)
 36 |             df[f'{col}_LabelEnc'] = df[col].map(tmp_map).fillna(-1).astype(int)
 37 |     else:
 38 |         print('Transfer category feature to  category feature ')
 39 |         for col in cat_cols:
 40 |             df[col] = df[col].astype('category')
 41 |     print('Transfer category feature to  num feature  Down...')
 42 |     return df
 43 | 
 44 | def add_minutes(x, minutes=5):
 45 |     dt = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
 46 |     out_date = (dt + datetime.timedelta(minutes=minutes)
 47 |                 ).strftime('%Y-%m-%d %H:%M:%S')
 48 |     return out_date
 49 | 
 50 | 
 51 | def time_process(df, time_cols, minutes_):
 52 |     df[f'time_{minutes_}'] = df[time_cols].apply(
 53 |         lambda x: add_minutes(str(x), minutes_))
 54 |     return df
 55 | 
 56 | 
 57 | def get_fea(x, fea):
 58 |     if fea in x:
 59 |         return 1
 60 |     else:
 61 |         return 0
 62 | 
 63 | 
 64 | def get_last_msg_cnt(x):
 65 |     last_msg = x[-1]
 66 |     cnt = x.count(last_msg)
 67 |     return cnt
 68 | 
 69 | 
 70 | def get_first_msg_cnt(x):
 71 |     first_msg = x[0]
 72 |     cnt = x.count(first_msg)
 73 |     return cnt
 74 | 
 75 | 
 76 | def add_last_next_time4fault(label, preliminary_submit_dataset_a,
 77 |                              time_interval, next_time_list):
 78 |     print(f'添加自定义异常出现的时间间隔{time_interval}的前后的时间点')
 79 |     for i in tqdm([-i for i in next_time_list] + next_time_list):
 80 |         label = time_process(label, 'fault_time', i * time_interval)
 81 |         preliminary_submit_dataset_a = time_process(
 82 |             preliminary_submit_dataset_a, 'fault_time', i * time_interval)
 83 | 
 84 |     return label, preliminary_submit_dataset_a
 85 | 
 86 | 
 87 | def get_msg_text_fea(df, msg_type='last'):
 88 |     print(f'获取 msg text {msg_type}特征')
 89 | 
 90 |     df_fea = df.groupby(['sn', 'fault_time']).agg(
 91 |         {'msg_list': 'sum', 'msg_0': 'sum', 'msg_1': 'sum', 'msg_2': 'sum'}).reset_index()
 92 |     df_fea['msg_list_unique'] = df_fea['msg_list'].apply(lambda x: str(set(x)))
 93 |     df_fea['msg_0_unique'] = df_fea['msg_0'].apply(lambda x: str(set(x)))
 94 |     df_fea['msg_1_unique'] = df_fea['msg_1'].apply(lambda x: str(set(x)))
 95 |     df_fea['msg_2_unique'] = df_fea['msg_2'].apply(lambda x: str(set(x)))
 96 | 
 97 |     df_fea['msg_list_list'] = df_fea['msg_list'].apply(lambda x: str(x))
 98 |     df_fea['msg_0_list'] = df_fea['msg_0'].apply(lambda x: str(x))
 99 |     df_fea['msg_1_list'] = df_fea['msg_1'].apply(lambda x: str(x))
100 |     df_fea['msg_2_list'] = df_fea['msg_2'].apply(lambda x: str(x))
101 | 
102 |     df_fea['msg_0_first'] = df_fea['msg_0'].apply(lambda x: x[0])
103 |     df_fea['msg_1_first'] = df_fea['msg_1'].apply(lambda x: x[0])
104 |     df_fea['msg_2_first'] = df_fea['msg_2'].apply(lambda x: x[0])
105 | 
106 |     df_fea['msg_0_last'] = df_fea['msg_0'].apply(lambda x: x[-1])
107 |     df_fea['msg_1_last'] = df_fea['msg_1'].apply(lambda x: x[-1])
108 |     df_fea['msg_2_last'] = df_fea['msg_2'].apply(lambda x: x[-1])
109 | 
110 |     df_fea['msg_last'] = df.groupby(['sn', 'fault_time']).apply(
111 |         lambda x: x['msg'].to_list()[-1]).values
112 |     df_fea['msg_first'] = df.groupby(['sn', 'fault_time']).apply(
113 |         lambda x: x['msg'].to_list()[0]).values
114 | 
115 |     df_fea['last_msg_cnt'] = df_fea['msg_list'].apply(
116 |         lambda x: get_last_msg_cnt(x))
117 |     df_fea['first_msg_cnt'] = df_fea['msg_list'].apply(
118 |         lambda x: get_first_msg_cnt(x))
119 |     cat_cols = ['msg_list', 'msg_0', 'msg_1', 'msg_2',
120 |                 'msg_list_unique', 'msg_0_unique', 'msg_1_unique', 'msg_2_unique',
121 |                 'msg_list_list', 'msg_0_list', 'msg_1_list', 'msg_2_list',
122 |                 'msg_0_first', 'msg_1_first', 'msg_2_first', 'msg_0_last', 'msg_1_last',
123 |                 'msg_2_last', 'msg_last', 'msg_first']
124 |     num_cols = ['last_msg_cnt', 'first_msg_cnt']
125 |     id_cols = ['sn', 'fault_time']
126 | 
127 |     df_fea = df_fea.rename(
128 |         columns={
129 |             i: f'{msg_type}_{i}' for i in (cat_cols + num_cols)})
130 |     cat_cols = [f'{msg_type}_{i}' for i in cat_cols]
131 |     for cat_col in cat_cols:
132 |         df_fea[cat_col] = df_fea[cat_col].astype(str)
133 |     df_fea = cat2num(df_fea, cat_cols, Transfer2num=True)
134 |     for i in cat_cols:
135 |         del df_fea[i]
136 |     return df_fea
137 | 
138 | def add_w2v_feats(all_data,w2v_feats_df,f1,f2,emb_size = 32,window = 5,min_count  =5,):
139 |     print(f'生成 {f1}_{f2}_w2v 特征')
140 | 
141 |     df_fea = all_data.groupby(f1).agg({f2:'sum'}).reset_index()
142 |     df_emb = df_fea[[f1 ]]
143 |     sencences = df_fea[f2].to_list()
144 |     if not os.path.exists(os.path.join(GENERATION_DIR, f'{f1}_{f2}_w2v_model.pkl')):
145 |         print(f'{f1}_{f2}_w2v_model 不存在，开始训练......')
146 |         model = Word2Vec(sencences, vector_size=emb_size, window=window,
147 |                          min_count=min_count, sg=0, hs=1, seed=42)
148 |         with open(os.path.join(GENERATION_DIR, f'{f1}_{f2}_w2v_model.pkl'), 'wb') as f:
149 |             pickle.dump(model, f)
150 |     else:
151 |         print(f'{f1}_{f2}_w2v_model 已存在，开始读取......')
152 |         with open(os.path.join(GENERATION_DIR, f'{f1}_{f2}_w2v_model.pkl'), 'rb') as f:
153 |             model = pickle.load(f)
154 | 
155 |     emb_matrix_mean = []
156 |     for sent in sencences:
157 |         vec = []
158 |         for w in sent:
159 |             if w in model.wv:
160 |                 vec.append(model.wv[w])
161 |         if len(vec) >0:
162 |             emb_matrix_mean.append(np.mean(vec,axis = 0))
163 |         else:
164 |             emb_matrix_mean.append([0]*emb_size)
165 |     df_emb_mean = pd.DataFrame(emb_matrix_mean).add_prefix(f'{f1}_{f2}_w2v_')
166 | 
167 |     df_emb = pd.concat([df_emb,df_emb_mean],axis = 1)
168 |     w2v_feats_df = w2v_feats_df.merge(df_emb,on = f1,how ='left')
169 |     return w2v_feats_df
170 | def get_w2v_feats(all_data,f1_list,f2_list):
171 |     all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
172 |     all_data['msg_0'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 0)])
173 |     all_data['msg_1'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 1)])
174 |     all_data['msg_2'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 2)])
175 |     w2v_feats_df = all_data[f1_list].drop_duplicates()
176 |     for f1 in f1_list:
177 |         for f2 in f2_list:
178 |             w2v_feats_df = add_w2v_feats(all_data,w2v_feats_df,f1,f2,emb_size = 10,window = 5,min_count  =5,)
179 |     print(f'w2v_feats 的特征维度: {w2v_feats_df.shape}')
180 |     return w2v_feats_df
181 | 
182 | 
183 | 
184 | def get_time_diff_feats_v2(all_data):
185 |     print('生成时间差特征 time_diff_feats_v2')
186 |     all_data['duration_seconds'] = all_data['time_interval']
187 |     all_data['duration_minutes'] = all_data['time_interval'] / 60
188 |     df_merge_log = all_data[['sn', 'fault_time', 'label', 'time', 'msg',
189 |                              'server_model', 'time_interval', 'duration_seconds',
190 |                              'duration_minutes']]
191 |     df_merge_log['fault_id'] = df_merge_log['sn'] + '_' + df_merge_log['fault_time'] + '_' + df_merge_log[
192 |         'server_model']
193 |     f1_list = ['fault_id', 'sn', 'server_model']
194 |     f2_list = ['duration_minutes', 'duration_seconds']
195 |     time_diff_feats_v2 = df_merge_log[['sn', 'fault_time', 'fault_id', 'server_model']].drop_duplicates().reset_index(
196 |         drop=True)
197 | 
198 |     for f1 in f1_list:
199 |         for f2 in f2_list:
200 |             func_opt = ['count', 'nunique', 'min', 'max', 'median', 'sum']
201 |             for opt in func_opt:
202 |                 tmp = df_merge_log.groupby([f1])[f2].agg([(f'{f2}_in_{f1}_' + opt, opt)]).reset_index()
203 |                 # print(f'{f1}_in_{f2}_{opt}:{tmp.shape}' )
204 |                 time_diff_feats_v2 = time_diff_feats_v2.merge(tmp, on=f1, how='left')
205 | 
206 |             temp = df_merge_log.groupby([f1])[f2].apply(lambda x: stats.mode(x)[0][0])
207 |             time_diff_feats_v2[f'{f2}_in_{f1}_mode'] = time_diff_feats_v2[f1].map(temp).fillna(np.nan)
208 |             secs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
209 |             for sec in secs:
210 |                 temp = df_merge_log.groupby([f1])[f2].quantile(sec).reset_index(
211 |                     name=f'log_{f2}_in_{f1}_quantile_' + str(sec * 100))
212 |                 # print(f'log_{f1}_in_{f2}_quantile_{str(sec * 100)}:{tmp.shape}' )
213 |                 time_diff_feats_v2 = pd.merge(time_diff_feats_v2, temp, on=f1, how='left')
214 |     del time_diff_feats_v2['fault_id']
215 |     return time_diff_feats_v2
216 | 
217 | def get_feature(data, time_list, log_fea, fea_num, key):
218 |     print(f'当前特征维度{data.shape}')
219 |     fea_df_list = []
220 |     fea_cnt_list = ['OEM record c2', 'Processor CPU_Core_Error', '001c4c', 'System Event Sys_Event','OEM CPU0 MCERR',
221 |                     'OEM CPU0 CATERR', 'Reading 0 &lt; Threshold 2 degrees C', '0203c0a80101',
222 |                     'Unknown CPU0 MCERR', 'Unknown CPU0 CATERR','Memory', 'Correctable ECC logging limit reached',
223 |                     'Memory MEM_CHE0_Status', 'Memory Memory_Status',  'Memory #0x87', 'Memory CPU0F0_DIMM_Stat',
224 |                     'Drive Fault', 'NMI/Diag Interrupt', 'Failure detected',  'Power Supply AC lost', ]
225 |     for time_tmp in tqdm(time_list):
226 |         print(f'获取异常前后 {time_tmp} min的数据进行聚合')
227 |         tmp1 = data[(pd.to_datetime(data['time']) < pd.to_datetime(data[f'time_{time_tmp}'])) & (pd.to_datetime(data['time']) > pd.to_datetime(data[f'time_-{time_tmp}']))].sort_values(
228 |             ['sn', 'fault_time'])
229 |         tmp1 = tmp1.groupby(key).apply(
230 |             lambda x: ' | '.join(x['msg'].to_list())).reset_index().rename(columns={0: 'msg'})
231 |         tmp1[f'msg_len'] = tmp1['msg'].apply(lambda x: len(x.split(' | ')))
232 |         #         tmp1[f'msg_len_two'] = tmp1['msg'].apply(lambda x: len(x))
233 |         # 添加数字个数
234 |         # tmp1[f'msg_num_two'] = tmp1['msg'].apply(
235 |         #     lambda x: len([int(s) for s in re.findall(r'\b\d+\b', x)]))
236 |         print(f'根据异常前后 {time_tmp} min的数据的日志数据提取 {fea_num} 个稀疏特征')
237 |         feature = log_fea + ['msg_len']
238 |         for fea in feature:
239 |             tmp1[fea] = tmp1['msg'].apply(lambda x: get_fea(x, fea))
240 |             # 添加计数特征
241 |             if fea in fea_cnt_list:
242 |                 tmp1[f'{fea}_cnt'] = tmp1['msg'].apply(lambda x:x.replace('|',' ').replace('_',' ').split(' ').count(fea))
243 |                 feature.append(f'{fea}_cnt')
244 |         tmp1_new_col_map = {i: i + '_' + str(int(time_tmp)) for i in feature}
245 |         tmp1 = tmp1.rename(columns=tmp1_new_col_map)
246 |         del tmp1['msg']
247 |         fea_df_list.append(tmp1)
248 |     fea_df = fea_df_list[-1]
249 |     print(fea_df.shape)
250 |     for i in fea_df_list[:-1]:
251 |         fea_df = fea_df.merge(i, on=key, how='left')
252 |         print(fea_df.shape)
253 |     return fea_df
254 | 
255 | 
256 | def get_msg_location(x, num):
257 |     try:
258 |         return x[num]
259 |     except BaseException:
260 |         return '其它'
261 | 
262 | 
263 | def get_nearest_msg_fea(train, test):
264 |     print('生成 nearest_msg 特征')
265 |     df = pd.concat([train, test], axis=0, ignore_index=True)
266 |     df['duration_minutes'] = (pd.to_datetime(df['fault_time']) - pd.to_datetime(df['time'])).apply(
267 |         lambda x: x.total_seconds())
268 |     df = df.sort_values(
269 |         ['sn', 'server_model', 'fault_time', 'time']).reset_index(drop=True)
270 |     df['duration_minutes_abs'] = np.abs(df['duration_minutes'])
271 | 
272 |     df['duration_minutes_abs_rank'] = df.groupby(['sn', 'server_model', 'fault_time'])['duration_minutes_abs'].rank(
273 |         method='first', ascending=False)
274 | 
275 |     key = ['sn', 'server_model', 'fault_time', 'duration_minutes_abs']
276 |     df = df.sort_values(key, ascending=False)
277 |     df = df.drop_duplicates(
278 |         ['sn', 'server_model', 'fault_time', ], keep='first')
279 | 
280 |     df.loc[df['duration_minutes'] ==
281 |            df['duration_minutes_abs'], 'last_or_next'] = 1
282 |     df.loc[df['duration_minutes'] !=
283 |            df['duration_minutes_abs'], 'last_or_next'] = 0
284 |     df['msg_cnt'] = df['msg'].map(df['msg'].value_counts())
285 |     df['msg_0'] = df['msg'].apply(
286 |         lambda x: get_msg_location(
287 |             x.split(' | '), 0))
288 |     df['msg_0_cnt'] = df['msg_0'].map(df['msg_0'].value_counts())
289 |     df['msg_1'] = df['msg'].apply(
290 |         lambda x: get_msg_location(
291 |             x.split(' | '), 1))
292 |     df['msg_1_cnt'] = df['msg_1'].map(df['msg_1'].value_counts())
293 |     df['msg_2'] = df['msg'].apply(
294 |         lambda x: get_msg_location(
295 |             x.split(' | '), 2))
296 |     df['msg_2_cnt'] = df['msg_2'].map(df['msg_2'].value_counts())
297 |     cat_feats = ['msg', 'msg_0', 'msg_1',
298 |                  'msg_2']  # ,'server_model_day_date','server_model_dayofmonth','server_model_dayofweek','server_model_hour']
299 |     # for name in cat_feats:
300 |     #     le = LabelEncoder()
301 |     #     df[f'{name}_LabelEnc'] = le.fit_transform(df[name])
302 |     df = cat2num(df,cat_feats)
303 |     df = df.drop_duplicates().reset_index(drop=True)
304 |     df = df[['sn', 'server_model', 'fault_time', 'msg_cnt',
305 |              'msg_0_cnt', 'msg_1_cnt', 'msg_2_cnt',
306 |              #              'duration_minutes_abs','duration_minutes', 'duration_minutes_abs_rank',
307 |              'last_or_next', 'msg_LabelEnc', 'msg_0_LabelEnc', 'msg_1_LabelEnc', 'msg_2_LabelEnc']]
308 |     print(f'生成 nearest_msg 特征完毕，特征维度{df.shape}')
309 |     return df
310 | 
311 | def get_server_model_time_interval_stat_fea(all_data):
312 |     server_model_time_interval_stat_fea = all_data.groupby('server_model').agg({'time_interval':['min','max','mean','median']}).reset_index()
313 |     server_model_time_interval_stat_fea = get_new_cols(server_model_time_interval_stat_fea,key = ['server_model' ])
314 | 
315 |     server_model_time_interval_stat_fea.columns  = ['server_model', 'sm_time_interval_min', 'sm_ttime_interval_max',
316 |            'sm_ttime_interval_mean', 'sm_ttime_interval_median']
317 |     return server_model_time_interval_stat_fea
318 | 
319 | def get_server_model_sn_fea_2(train, test):
320 |     df = pd.concat([train[['sn', 'server_model']],
321 |                    test[['sn', 'server_model']]], ignore_index=True)
322 |     df['server_model_count_sn_2'] = df.groupby(
323 |         ['server_model'])['sn'].transform('count')
324 |     df['server_model_nunique_sn_2'] = df.groupby(
325 |         ['server_model'])['sn'].transform('nunique')
326 |     df['sn_cnt_2'] = df['sn'].map(df['sn'].value_counts())
327 |     return df.drop_duplicates().reset_index(drop=True)
328 | 
329 | 
330 | def get_4_time_stat_fea(df):
331 |     print('     生成时间统计特征')
332 |     time_stat_fea_df = df.groupby(['sn', 'fault_time', 'server_model']).agg(
333 |         {'duration_minutes': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std', 'count'],
334 |          'log_duration_minutes': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std'],
335 |          'time_diff_1': ['min', 'max', 'mean', 'median', 'skew', 'sum', 'std'],
336 |          'log_time_diff_1': ['min', 'max', 'median'],
337 |          }).reset_index()
338 |     new_time_stat_cols = []
339 |     for i in time_stat_fea_df.columns:
340 |         if i[0] in ['sn', 'fault_time', 'server_model']:
341 |             new_time_stat_cols.append(i[0])
342 |         else:
343 |             new_time_stat_cols.append(f'{i[0]}_{i[1]}')
344 |             #             print(f'{i[0]}_{i[1]}')
345 |             time_stat_fea_df.loc[time_stat_fea_df[i[0]]
346 |                                  [i[1]] == -np.inf, (i[0], i[1])] = -20
347 |             time_stat_fea_df.loc[time_stat_fea_df[i[0]]
348 |                                  [i[1]] == np.inf, (i[0], i[1])] = 30
349 |     time_stat_fea_df.columns = new_time_stat_cols
350 |     time_stat_fea_df['duration_minutes_range'] = time_stat_fea_df['duration_minutes_max'] - time_stat_fea_df[
351 |         'duration_minutes_min']
352 |     time_stat_fea_df['log_duration_minutes_range'] = time_stat_fea_df['log_duration_minutes_max'] - time_stat_fea_df[
353 |         'log_duration_minutes_min']
354 |     time_stat_fea_df['time_diff_1_range'] = time_stat_fea_df['time_diff_1_max'] - \
355 |         time_stat_fea_df['time_diff_1_min']
356 |     time_stat_fea_df['log_time_diff_1_range'] = time_stat_fea_df['log_time_diff_1_max'] - time_stat_fea_df[
357 |         'log_time_diff_1_min']
358 |     time_stat_fea_df['duration_minutes_freq'] = time_stat_fea_df['duration_minutes_range'] / time_stat_fea_df[
359 |         'duration_minutes_count']
360 |     print(f'    生成时间统计特征完毕，特征维度:{time_stat_fea_df.shape}')
361 |     return time_stat_fea_df
362 | 
363 | 
364 | def get_time_std_fea(train, test):
365 |     print('生成 server_model 特征')
366 |     df = pd.concat([train, test], axis=0, ignore_index=True)
367 |     # df['year'] = df['time'].dt.year
368 |     # df['month'] = df['time'].dt.month
369 |     df['hour'] = df['time'].dt.hour
370 |     # df['week'] = df['time'].dt.week
371 |     df['minute'] = df['time'].dt.minute
372 |     time_std = df.groupby(['sn', 'server_model']).agg(
373 |         {'hour': 'std', 'minute': 'std'}).reset_index()
374 |     time_std = time_std.rename(
375 |         columns={
376 |             'hour': 'hour_std',
377 |             'minute': 'minute_std'})
378 |     return time_std
379 | 
380 | 
381 | def get_key(all_data):
382 |     all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
383 |     class_fea_cnt_list = []
384 |     for label in [0,1,2,3]:
385 |         class_df = all_data.query(f'label =={label}')
386 |         counter = Counter()
387 |         for i in class_df['msg_list']:
388 |             counter.update(i)
389 |         class_fea_cnt = pd.DataFrame({i[0]:i[1] for i in counter.most_common()},index = [f'fea_cnt_{label}']).T.reset_index().rename(columns = {'index':'fea'})
390 |         class_fea_cnt_list.append(class_fea_cnt)
391 | 
392 |     fea_cnt_df = class_fea_cnt_list[0]
393 |     for tmp in class_fea_cnt_list[1:]:
394 |         fea_cnt_df = fea_cnt_df.merge(tmp,on = 'fea')
395 | 
396 |     fea_cnt_df['fea_cnt_sum'] = fea_cnt_df.loc[:,['fea_cnt_0', 'fea_cnt_1', 'fea_cnt_2', 'fea_cnt_3']].sum(1)
397 | 
398 |     all_fea_cnt = fea_cnt_df['fea_cnt_sum'].sum()
399 | 
400 |     for i in ['fea_cnt_0', 'fea_cnt_1', 'fea_cnt_2', 'fea_cnt_3']:
401 |         fea_cnt_df[f'{i}_ratio'] = fea_cnt_df[i]/fea_cnt_df['fea_cnt_sum']
402 |         fea_cnt_df[f'{i}_all_ratio'] = fea_cnt_df[i]/all_fea_cnt
403 | 
404 |     fea_cnt_df['fea_cnt_ratio_std'] = fea_cnt_df.loc[:,['fea_cnt_0_ratio','fea_cnt_1_ratio','fea_cnt_2_ratio','fea_cnt_3_ratio', ]].std(1)
405 |     fea_cnt_df['fea_cnt_std'] = fea_cnt_df.loc[:,['fea_cnt_0', 'fea_cnt_1','fea_cnt_2','fea_cnt_3',]].std(1)
406 | 
407 |     fea_cnt_df['fea_cnt_all_ratio_std'] = fea_cnt_df.loc[:,['fea_cnt_0_all_ratio','fea_cnt_1_all_ratio',
408 |            'fea_cnt_2_all_ratio','fea_cnt_3_all_ratio',]].std(1)
409 | 
410 |     fea_cnt_df = fea_cnt_df[~fea_cnt_df['fea_cnt_ratio_std'].isnull()].sort_values('fea_cnt_ratio_std',ascending = False)
411 | 
412 |     fea_cnt_df['fea_max'] = np.argmax(fea_cnt_df.loc[:,['fea_cnt_0', 'fea_cnt_1', 'fea_cnt_2', 'fea_cnt_3',]].values,axis = 1)
413 |     key_0 = fea_cnt_df.query('fea_max ==0 ')['fea'].to_list()
414 |     key_1 = fea_cnt_df.query('fea_max ==1 ')['fea'].to_list()
415 |     key_2 = fea_cnt_df.query('fea_max ==2 ')['fea'].to_list()
416 |     key_3 = fea_cnt_df.query('fea_max ==3 ')['fea'].to_list()
417 |     # key_1 = ['OEM record c2','Processor CPU_Core_Error','001c4c','System Event Sys_Event','Power Supply PS0_Status','Temperature CPU0_Margin_Temp','Reading 51 &gt; Threshold 85 degrees C','Lower Non-critical going low','Temperature CPU1_Margin_Temp','System ACPI Power State #0x7d','Lower Critical going low']
418 |     # key_2 = ['OEM CPU0 MCERR','OEM CPU0 CATERR','Reading 0 &lt; Threshold 2 degrees C','0203c0a80101','Unknown CPU0 MCERR','Unknown CPU0 CATERR','Microcontroller #0x3b','System Boot Initiated','Processor #0xfa','Power Unit Pwr Unit Status','Hard reset','Power off/down','System Event #0xff','Memory CPU1A1_DIMM_Stat','000000','Power cycle','OEM record c3','Memory CPU1C0_DIMM_Stat','Reading 0 &lt; Threshold 1 degrees C','IERR']
419 |     # key_3 = ['Memory','Correctable ECC logging limit reached','Memory MEM_CHE0_Status','Memory Memory_Status','Memory #0x87','Memory CPU0F0_DIMM_Stat','Memory Device Disabled','Memory #0xe2','OS Stop/Shutdown OS Status','System Boot Initiated System Restart','OS Boot BIOS_Boot_Up','System Boot Initiated BIOS_Boot_UP','Memory DIMM101','OS graceful shutdown','OS Critical Stop OS Status','Memory #0xf9','Memory CPU0C0_DIMM_Stat','Memory DIMM111','Memory DIMM021',]
420 |     # key_4 = ['Drive Fault','NMI/Diag Interrupt','Failure detected','Power Supply AC lost','Power Supply PSU0_Supply','AC out-of-range, but present','Predictive failure','Drive Present','Temperature Temp_DIMM_KLM','Temperature Temp_DIMM_DEF','Power Supply PS1_Status','Identify Status','Power Supply PS2_Status','Temperature DIMMG1_Temp','Upper Non-critical going high','Temperature DIMMG0_Temp','Upper Critical going high','Power Button pressed','System Boot Initiated #0xb8','Deasserted']
421 |     return key_0,key_1,key_2,key_3
422 | 
423 | def get_class_key_words_nunique(all_data):
424 |     print('获取 class_key_words_nunique 特征')
425 | 
426 |     key_0,key_1,key_2,key_3 = get_key(all_data)
427 | 
428 |     df = all_data[['sn', 'fault_time', 'msg_list']]
429 |     df_tmp = df.groupby(['sn' ]).agg({'msg_list':'sum'}).reset_index()
430 |     df_tmp['class_0_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x:len(set(x)&set(key_0)))
431 |     df_tmp['class_1_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x:len(set(x)&set(key_1)))
432 |     df_tmp['class_2_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x:len(set(x)&set(key_2)))
433 |     df_tmp['class_3_key_words_nunique'] = df_tmp['msg_list'].apply(lambda x:len(set(x)&set(key_3)))
434 |     del df_tmp['msg_list']
435 |     return df_tmp
436 | def get_key_for_top_fea(train,test):
437 |     KEY_FOR_TOP_COLS = []
438 |     print('添加 key_for_top_fea 特征')
439 |     for TIME in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600,60000000]:
440 |         for i in range(10):
441 |             train[f'KEY_FOR_TOP_{i}_{TIME}'] = train[f'{KEY_1[i]}_{TIME}'].astype(str)+'_'+train[f'{KEY_2[i]}_{TIME}'].astype(str)+'_'+train[f'{KEY_3[i]}_{TIME}'].astype(str)+'_'+train[f'{KEY_4[i]}_{TIME}'].astype(str)
442 |             test[f'KEY_FOR_TOP_{i}_{TIME}'] = test[f'{KEY_1[i]}_{TIME}'].astype(str)+'_'+test[f'{KEY_2[i]}_{TIME}'].astype(str)+'_'+test[f'{KEY_3[i]}_{TIME}'].astype(str)+'_'+test[f'{KEY_4[i]}_{TIME}'].astype(str)
443 |             KEY_FOR_TOP_COLS.append(f'KEY_FOR_TOP_{i}_{TIME}')
444 |     train = cat2num(train,KEY_FOR_TOP_COLS)
445 |     test = cat2num(test,KEY_FOR_TOP_COLS)
446 |     for KEY_FOR_TOP_COL in KEY_FOR_TOP_COLS:
447 |         del train[KEY_FOR_TOP_COL]
448 |         del test[KEY_FOR_TOP_COL]
449 |     return train,test
450 | 
451 | def get_key_word_cross_fea(train,test):
452 |     print('获取关键词交叉特征......')
453 |     KEY_WORDS_MAP  = {'CPU0':KEY_1,'CPU1':KEY_2,'CPU2':KEY_3,'CPU3':KEY_4}
454 |     KEY_WORDS_CROSS_COLS =[]
455 |     for KEY_WORDS in KEY_WORDS_MAP:
456 |         for i in [3, 5, 10, 15, 30, 45, 60, 90, 120, 240, 360, 480, 540, 600,60000000]:
457 |             KEY_WORDS_COLS = [f'{col}_{i}' for col in KEY_WORDS_MAP[KEY_WORDS]]
458 |             train[f'{KEY_WORDS}_WORDS_{i}'] = train[KEY_WORDS_COLS].astype(str).sum(1)
459 |             test[f'{KEY_WORDS}_WORDS_{i}'] = test[KEY_WORDS_COLS].astype(str).sum(1)
460 |             KEY_WORDS_CROSS_COLS.append(f'{KEY_WORDS}_WORDS_{i}')
461 |     train = cat2num(train,KEY_WORDS_CROSS_COLS)
462 |     test = cat2num(test,KEY_WORDS_CROSS_COLS)
463 | 
464 |     for COLS in KEY_WORDS_CROSS_COLS:
465 |         del train[COLS]
466 |         del test[COLS]
467 |     print('获取关键词交叉特征完毕......')
468 |     return train,test
469 | def get_time_quantile_fea(df):
470 |     print('    生成时间分位数特征')
471 |     secs = [0.2, 0.4, 0.6, 0.8]
472 |     time_fea_list = []
473 |     for sec in tqdm(secs):
474 |         for time_fea_type in [
475 |                 'duration_minutes', 'log_duration_minutes', 'time_diff_1', 'log_time_diff_1']:
476 |             temp = df.groupby(['sn', 'server_model', 'fault_time'])[time_fea_type].quantile(sec).reset_index(
477 |                 name=f'{time_fea_type}_' + str(sec * 100))
478 | 
479 |             time_fea_list.append(temp)
480 |     time_fea_df = time_fea_list[0]
481 |     for time_fea in time_fea_list[1:]:
482 |         time_fea_df = time_fea_df.merge(
483 |             time_fea, how='left', on=[
484 |                 'sn', 'server_model', 'fault_time'])
485 |     print(f'    生成时间分位数特征完毕，特征维度:{time_fea_df.shape}')
486 |     return time_fea_df
487 | 
488 | 
489 | def get_server_model_fea(train, test):
490 |     print('生成 server_model 特征')
491 |     df = pd.concat([train, test], axis=0, ignore_index=True)
492 |     df['server_model_count_sn'] = df.groupby(
493 |         ['server_model'])['sn'].transform('count')
494 |     df['server_model_nunique_sn'] = df.groupby(
495 |         ['server_model'])['sn'].transform('nunique')
496 |     #     df['server_model_count'] = df.groupby('server_model')['server_model'].transform('count')
497 |     #     df['server_model_cnt_quantile'] = df['server_model'].map(
498 |     #         df['server_model'].value_counts().rank() / len(df['server_model'].unique()))
499 |     #     df['server_model_cnt_rank'] = df[f'server_model_cnt_quantile'].rank(method='min')
500 | 
501 |     df['sn_cnt'] = df['sn'].map(df['sn'].value_counts())
502 |     df['sn_freq'] = df['sn'].map(df['sn'].value_counts() / len(df))
503 |     df['server_model_cnt'] = df['server_model'].map(
504 |         df['server_model'].value_counts())
505 |     df['server_model_freq'] = df['server_model'].map(
506 |         df['server_model'].value_counts() / len(df))
507 |     select_cols = ['sn', 'server_model',
508 |                    'server_model_count_sn', 'server_model_nunique_sn',
509 |                    'sn_cnt', 'sn_freq', 'server_model_cnt', 'server_model_freq'
510 |                    #                    'server_model_count','server_model_cnt_quantile', 'server_model_cnt_rank'
511 |                    ]
512 |     server_model_fea = df[select_cols]
513 | 
514 |     cat_feats = [
515 |         'server_model']  # ,'server_model_day_date','server_model_dayofmonth','server_model_dayofweek','server_model_hour']
516 |     # for name in cat_feats:
517 |     #     le = LabelEncoder()
518 |     #     server_model_fea[f'{name}_LabelEnc'] = le.fit_transform(
519 |     #         server_model_fea[name])
520 |     server_model_fea = cat2num(server_model_fea, cat_feats, Transfer2num=True)
521 |     server_model_fea = server_model_fea.drop_duplicates().reset_index(drop=True)
522 |     print(f'生成 server_model 特征完毕，特征维度:{server_model_fea.shape}')
523 | 
524 |     return server_model_fea
525 | 
526 | 
527 | def get_time_type_msg_unique_fea(df):
528 |     df['msg_list'] = df['msg'].apply(
529 |         lambda x: [i.strip() for i in x.split(' | ')])
530 | 
531 |     df['msg_0'] = df['msg'].apply(
532 |         lambda x: [
533 |             get_msg_location(
534 |                 x.split(' | '),
535 |                 0)])
536 |     df['msg_1'] = df['msg'].apply(
537 |         lambda x: [
538 |             get_msg_location(
539 |                 x.split(' | '),
540 |                 1)])
541 |     df['msg_2'] = df['msg'].apply(
542 |         lambda x: [
543 |             get_msg_location(
544 |                 x.split(' | '),
545 |                 2)])
546 | 
547 |     df = df.groupby(['sn', 'fault_time']).agg(
548 |         {'msg_list': 'sum', 'msg_0': 'sum', 'msg_1': 'sum', 'msg_2': 'sum'}).reset_index()
549 | 
550 |     df['msg_set'] = df['msg_list'].apply(lambda x: '|'.join(list(set(x))))
551 | 
552 |     df['msg_0_set'] = df['msg_0'].apply(lambda x: '|'.join(list(set(x))))
553 |     df['msg_1_set'] = df['msg_1'].apply(lambda x: '|'.join(list(set(x))))
554 |     df['msg_2_set'] = df['msg_2'].apply(lambda x: '|'.join(list(set(x))))
555 |     df = df[['sn', 'fault_time', 'msg_set',
556 |              'msg_0_set', 'msg_1_set', 'msg_2_set']]
557 |     return df
558 | 
559 | 
560 | def get_msg_unique_fea(train, test, time_type='last'):
561 |     print('生成msg_unique_ fea')
562 |     common_cols = ['msg_set', 'msg_0_set', 'msg_1_set', 'msg_2_set']
563 |     df = pd.concat([train, test], axis=0, ignore_index=True)
564 |     df['time_interval'] = (
565 |         pd.to_datetime(
566 |             df['fault_time']) -
567 |         df['time']).apply(
568 |             lambda x: x.total_seconds())
569 | 
570 |     last_fea = get_time_type_msg_unique_fea(df.query('time_interval >0'))
571 |     last_fea = last_fea.rename(columns={i: f'last_{i}' for i in common_cols})
572 |     next_fea = get_time_type_msg_unique_fea(df.query('time_interval <0'))
573 |     next_fea = next_fea.rename(columns={i: f'next_{i}' for i in common_cols})
574 |     all_fea = get_time_type_msg_unique_fea(df)
575 |     all_fea = all_fea.rename(columns={i: f'all_{i}' for i in common_cols})
576 |     msg_unique_fea = all_fea.merge(
577 |         last_fea, on=['sn', 'fault_time'], how='outer')
578 |     msg_unique_fea = msg_unique_fea.merge(
579 |         next_fea, on=['sn', 'fault_time'], how='outer')
580 |     return msg_unique_fea
581 | 
582 | 
583 | def get_duration_minutes_fea(train, test):
584 |     print('生成 duration_minutes 特征')
585 |     df = pd.concat([train, test], axis=0, ignore_index=True)
586 |     df['duration_minutes'] = (pd.to_datetime(df['fault_time']) - pd.to_datetime(df['time'])).apply(
587 |         lambda x: x.total_seconds())
588 |     df['log_duration_minutes'] = np.log(df['duration_minutes'])
589 | 
590 |     df = df.sort_values(['sn', 'label', 'server_model',
591 |                         'fault_time', 'time']).reset_index(drop=True)
592 |     df['time_diff_1'] = (df.groupby(['sn', 'server_model', 'fault_time'])['time'].diff(1)).apply(
593 |         lambda x: x.total_seconds())
594 |     df['time_diff_1'] = df['time_diff_1'].fillna(0)
595 |     df['log_time_diff_1'] = np.log(df['time_diff_1'])
596 | 
597 |     # time_quantile_fea_df = get_time_quantile_fea(df)
598 |     # time_stat_fea_df = get_4_time_stat_fea(df)
599 |     # df_tmp = time_quantile_fea_df.merge(time_stat_fea_df, on= ['sn',   'server_model','fault_time'],how = 'left')
600 |     time_stat_fea_df = get_4_time_stat_fea(df)
601 |     df_tmp = time_stat_fea_df
602 |     print(f'生成 duration_minutes 特征完毕，特征维度{df_tmp.shape}')
603 |     return df_tmp
604 | 
605 | 
606 | def get_msg_text_fea_all(all_data):
607 |     all_data['label'] = all_data['label'].fillna(-1)
608 |     all_data['msg_list'] = all_data['msg'].apply(lambda x: [i.strip() for i in x.split(' | ')])
609 |     all_data['msg_0'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 0)])
610 |     all_data['msg_1'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 1)])
611 |     all_data['msg_2'] = all_data['msg'].apply(lambda x: [get_msg_location(x.split(' | '), 2)])
612 | 
613 |     all_data = all_data.sort_values(['sn', 'fault_time', 'time']).reset_index(drop=True)
614 |     del all_data['label']
615 |     last_data = all_data.query('time_interval >0')
616 |     next_data = all_data.query('time_interval <=0')
617 | 
618 |     # id_cols = ['sn', 'fault_time', 'label']
619 | 
620 |     # all_msg_text_fea = get_msg_text_fea(all_data, msg_type='all')
621 |     last_msg_text_fea = get_msg_text_fea(last_data, msg_type='last')
622 |     # next_msg_text_fea = get_msg_text_fea(next_data, msg_type='next')
623 |     msg_text_fea = last_msg_text_fea
624 |     return msg_text_fea
625 | 
626 | def get_test_key_words(train,test):
627 | 
628 |     df = pd.concat([train[['sn', 'fault_time', 'label','msg']],test[['sn', 'fault_time',  'msg']]],ignore_index = True).drop_duplicates(['sn', 'fault_time',  'msg'])
629 |     df['label'] = df['label'].fillna(5)
630 |     df['msg_list'] = df['msg'].apply(lambda x:[i.strip() for i in x.split(' | ')])
631 |     words_cnt_df_list = []
632 |     for label in df['label'].unique():
633 |         label = int(label)
634 |         df_tmp = df.query(f'label == {label}')
635 |         counter = Counter()
636 |         for words in df_tmp['msg_list']:
637 |             words = [i.replace('_',' ') for i in words]
638 |             # word_list = []
639 |             # for i in words:
640 |             #     word_list+=i.split(' ')
641 |             # words = word_list
642 |             counter.update(words)
643 |         words_cnt_df = pd.DataFrame(counter,index = [0]).T.reset_index().rename(columns = {'index':'word',0:f'cnt_{label}'})
644 |         words_cnt_df_list.append(words_cnt_df)
645 |     words_cnt_df = words_cnt_df_list[0]
646 |     for i in words_cnt_df_list[1:]:
647 |         words_cnt_df = words_cnt_df.merge(i,on = 'word',how = 'outer' )
648 | 
649 |     words_cnt_df = words_cnt_df.fillna(-1)
650 |     words_cnt_df1 = words_cnt_df.query('cnt_0 >10 and cnt_2 >10 and cnt_1 >10 and cnt_3>10 and cnt_5>10 ')
651 |     cnt_class = ['cnt_0','cnt_1','cnt_2','cnt_3','cnt_5']
652 |     words_cnt_df1['word_cnt_sum'] = words_cnt_df1.loc[:,cnt_class].sum(1)
653 |     for i in cnt_class:
654 |         words_cnt_df1[f'{i}_ratio'] = words_cnt_df1[i]/words_cnt_df1['word_cnt_sum']
655 |     words_cnt_df1['word_cnt_ratio_std'] = words_cnt_df1.loc[:,['cnt_0_ratio','cnt_1_ratio', 'cnt_2_ratio', 'cnt_3_ratio']].std(1)
656 |     words_cnt_df1['cnt_1_0_diff'] = (words_cnt_df1['cnt_1_ratio'] - words_cnt_df1['cnt_0_ratio'])
657 |     test_key_words = words_cnt_df1.sort_values('cnt_5',ascending = False)['word'].to_list()[5:40]
658 |     return test_key_words
659 | 
660 | def get_w2v_mean(w2v_model,sentences):
661 |     emb_matrix = list()
662 |     vec = list()
663 |     for w in sentences.split():
664 |         if w in w2v_model.wv:
665 |             vec.append(w2v_model.wv[w])
666 |     if len(vec) > 0:
667 |         emb_matrix.append(np.mean(vec, axis=0))
668 |     else:
669 |         emb_matrix.append([0] * w2v_model.vector_size)
670 |     return emb_matrix
671 | def get_tfidf_svd(tfv,svd,sentences, n_components=16):
672 |     X_tfidf = tfv.transform(sentences)
673 |     X_svd = svd.transform(X_tfidf)
674 |     return np.mean(X_svd, axis=0)
675 | def get_w2v_tfidf_fea(all_data):
676 |     print('w2v编码')
677 |     df = all_data
678 |     df['msg_list'] = df['msg'].apply(lambda x: [i.strip().lower().replace(' ','_') for i in x.split(" | ")])
679 |     df = df.groupby(['sn']).agg({'msg_list': 'sum'}).reset_index()
680 |     df['text'] = df['msg_list'].apply(lambda x: ' '.join(x))
681 | 
682 |     sentences_list = df['text'].values.tolist()
683 |     sentences = []
684 |     for s in sentences_list:
685 |         sentences.append([w for w in s.split()])
686 |     w2v_model = Word2Vec(sentences, vector_size=10, window=3, min_count=5, sg=0, hs=1, seed=2022)
687 |     df['text_w2v'] = df['text'].apply(lambda x: get_w2v_mean(w2v_model, x)[0])
688 | 
689 |     print('tfidf编码')
690 |     X = df['text'].to_list()
691 |     tfv = TfidfVectorizer(ngram_range=(1, 3), min_df=5, max_features=50000)
692 |     tfv.fit(X)
693 |     X_tfidf = tfv.transform(X)
694 |     svd = TruncatedSVD(n_components=16)  # 降维
695 |     svd.fit(X_tfidf)
696 |     df['text_tfidf'] = df['text'].apply(lambda x: get_tfidf_svd(tfv, svd, x.split()))
697 | 
698 |     print("doc2vec编码")
699 |     texts = df['text'].tolist()
700 |     documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]
701 |     model = Doc2Vec(documents, window=5, min_count=3, workers=4)
702 |     docvecs = model.docvecs
703 |     df['doc2vec'] = [docvecs[i] for i in range(len(docvecs))]
704 | 
705 |     for i in range(32):
706 |         df[f'msg_w2v_{i}'] = df['text_w2v'].apply(lambda x: x[i])
707 |     for i in range(16):
708 |         df[f'msg_tfv_{i}'] = df['text_tfidf'].apply(lambda x: x[i])
709 |     for i in range(100):
710 |         df[f'msg_doc2vec_{i}'] = df['doc2vec'].apply(lambda x: x[i])
711 | 
712 |     save_cols = [i for i in df.columns if i not in ['msg_list', 'text', 'text_w2v', 'text_tfidf', 'doc2vec']]
713 |     return df[save_cols]
714 | 
715 | # w2v_tfidf_fea = get_w2v_tfidf_fea(all_data)
716 | class BetaEncoder(object):
717 | 
718 |     def __init__(self, group):
719 | 
720 |         self.group = group
721 |         self.stats = None
722 | 
723 |     # get counts from df
724 |     def fit(self, df, target_col):
725 |         # 先验均值
726 |         self.prior_mean = np.mean(df[target_col])
727 |         stats = df[[target_col, self.group]].groupby(self.group)
728 |         # count和sum
729 |         stats = stats.agg(['sum', 'count'])[target_col]
730 |         stats.rename(columns={'sum': 'n', 'count': 'N'}, inplace=True)
731 |         stats.reset_index(level=0, inplace=True)
732 |         self.stats = stats
733 | 
734 |     # extract posterior statistics
735 |     def transform(self, df, stat_type, N_min=1):
736 | 
737 |         df_stats = pd.merge(df[[self.group]], self.stats, how='left')
738 |         n = df_stats['n'].copy()
739 |         N = df_stats['N'].copy()
740 | 
741 |         # fill in missing
742 |         nan_indexs = np.isnan(n)
743 |         n[nan_indexs] = self.prior_mean
744 |         N[nan_indexs] = 1.0
745 | 
746 |         # prior parameters
747 |         N_prior = np.maximum(N_min - N, 0)
748 |         alpha_prior = self.prior_mean * N_prior
749 |         beta_prior = (1 - self.prior_mean) * N_prior
750 | 
751 |         # posterior parameters
752 |         alpha = alpha_prior + n
753 |         beta = beta_prior + N - n
754 | 
755 |         # calculate statistics
756 |         if stat_type == 'mean':
757 |             num = alpha
758 |             dem = alpha + beta
759 | 
760 |         elif stat_type == 'mode':
761 |             num = alpha - 1
762 |             dem = alpha + beta - 2
763 | 
764 |         elif stat_type == 'median':
765 |             num = alpha - 1 / 3
766 |             dem = alpha + beta - 2 / 3
767 | 
768 |         elif stat_type == 'var':
769 |             num = alpha * beta
770 |             dem = (alpha + beta) ** 2 * (alpha + beta + 1)
771 | 
772 |         elif stat_type == 'skewness':
773 |             num = 2 * (beta - alpha) * np.sqrt(alpha + beta + 1)
774 |             dem = (alpha + beta + 2) * np.sqrt(alpha * beta)
775 | 
776 |         elif stat_type == 'kurtosis':
777 |             num = 6 * (alpha - beta) ** 2 * (alpha + beta + 1) - \
778 |                 alpha * beta * (alpha + beta + 2)
779 |             dem = alpha * beta * (alpha + beta + 2) * (alpha + beta + 3)
780 | 
781 |         # replace missing
782 |         value = num / dem
783 |         value[np.isnan(value)] = np.nanmedian(value)
784 |         return value
785 | 
786 | 
787 | def get_beta_target(train, test):
788 |     N_min = 1000
789 |     feature_cols = []
790 | 
791 |     # encode variables
792 |     for c in ['server_model']:
793 |         # fit encoder
794 |         be = BetaEncoder(c)
795 |         be.fit(train, 'label')
796 | 
797 |         # mean
798 |         feature_name = f'{c}_mean'
799 |         train[feature_name] = be.transform(train, 'mean', N_min)
800 |         test[feature_name] = be.transform(test, 'mean', N_min)
801 |         feature_cols.append(feature_name)
802 | 
803 |         # mode
804 |         feature_name = f'{c}_mode'
805 |         train[feature_name] = be.transform(train, 'mode', N_min)
806 |         test[feature_name] = be.transform(test, 'mode', N_min)
807 |         feature_cols.append(feature_name)
808 | 
809 |         # median
810 |         feature_name = f'{c}_median'
811 |         train[feature_name] = be.transform(train, 'median', N_min)
812 |         test[feature_name] = be.transform(test, 'median', N_min)
813 |         feature_cols.append(feature_name)
814 | 
815 |         # var
816 |         feature_name = f'{c}_var'
817 |         train[feature_name] = be.transform(train, 'var', N_min)
818 |         test[feature_name] = be.transform(test, 'var', N_min)
819 |         feature_cols.append(feature_name)
820 | 
821 |         #     # skewness
822 |         #     feature_name = f'{c}_skewness'
823 |         #     train[feature_name] = be.transform(train, 'skewness', N_min)
824 |         #     test[feature_name]  = be.transform(test,  'skewness', N_min)
825 |         #     feature_cols.append(feature_name)
826 | 
827 |         # kurtosis
828 |         feature_name = f'{c}_kurtosis'
829 |         train[feature_name] = be.transform(train, 'kurtosis', N_min)
830 |         test[feature_name] = be.transform(test, 'kurtosis', N_min)
831 |         feature_cols.append(feature_name)
832 |     df = train.append(test).reset_index(drop=True)
833 |     df = df[['sn', 'fault_time', 'server_model', 'server_model_mean',
834 |              'server_model_mode', 'server_model_median', 'server_model_var',
835 |              'server_model_kurtosis']].drop_duplicates().reset_index(drop=True)
836 |     return df
837 | 


--------------------------------------------------------------------------------