├── .idea
│   ├── .gitignore
│   ├── CCKS2022_JS.iml
│   ├── deployment.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── vcs.xml
├── Event_extraction
│   ├── BertGRU.py
│   ├── conf
│   │   └── EE1.0
│   │       ├── event_schema.json
│   │       ├── role_tag.dict
│   │       └── trigger_tag.dict
│   ├── data_agumentation.py
│   ├── duee_1_data_prepare.py
│   ├── log
│   │   ├── endpoints.log
│   │   └── workerlog.0
│   ├── model.py
│   ├── run_role_labeling.sh
│   ├── run_trigger_labeling.sh
│   ├── sequence_labeling.py
│   ├── sequence_labeling_role.py
│   ├── sequence_labeling_trigger.py
│   ├── test.py
│   └── utils.py
├── LICENSE
├── README.md
├── bert_multi_classification
│   ├── config.py
│   ├── convert_test_format.py
│   ├── convert_train_dev_format.py
│   ├── data_preprocess.py
│   ├── dataset.py
│   ├── logs
│   │   ├── final_main.log
│   │   ├── main.log
│   │   └── preprocess.log
│   ├── model
│   │   └── bert-base-chinese
│   │       ├── config.json
│   │       ├── tokenizer_config.json
│   │       └── vocab.txt
│   ├── models.py
│   ├── predict_result.py
│   ├── train.py
│   └── utils
│       └── utils.py
├── evaluation_metric.py
├── postprocess.py
├── requirements.txt
├── role_classification
│   ├── data
│   │   └── train.xlsx
│   ├── dataset.py
│   ├── predict.py
│   ├── train.py
│   └── utils.py
├── run_ee.sh
├── submit
│   └── result.txt
├── tree.py
└── yolov5
    ├── README_yolov5.md
    ├── data
    │   ├── ccks2022.yaml
    │   ├── coco128.yaml
    │   ├── hyps
    │   │   ├── hyp.Objects365.yaml
    │   │   ├── hyp.VOC.yaml
    │   │   ├── hyp.scratch-high.yaml
    │   │   ├── hyp.scratch-low.yaml
    │   │   └── hyp.scratch-med.yaml
    │   └── scripts
    │       ├── download_weights.sh
    │       ├── get_coco.sh
    │       └── get_coco128.sh
    ├── detect.py
    ├── export.py
    ├── hubconf.py
    ├── models
    │   ├── common.py
    │   ├── experimental.py
    │   ├── hub
    │   │   ├── anchors.yaml
    │   │   ├── yolov3-spp.yaml
    │   │   ├── yolov3-tiny.yaml
    │   │   ├── yolov3.yaml
    │   │   ├── yolov5-bifpn.yaml
    │   │   ├── yolov5-fpn.yaml
    │   │   ├── yolov5-p2.yaml
    │   │   ├── yolov5-p34.yaml
    │   │   ├── yolov5-p6.yaml
    │   │   ├── yolov5-p7.yaml
    │   │   ├── yolov5-panet.yaml
    │   │   ├── yolov5l6.yaml
    │   │   ├── yolov5m6.yaml
    │   │   ├── yolov5n6.yaml
    │   │   ├── yolov5s-ghost.yaml
    │   │   ├── yolov5s-transformer.yaml
    │   │   ├── yolov5s6.yaml
    │   │   └── yolov5x6.yaml
    │   ├── tf.py
    │   ├── yolo.py
    │   ├── yolov5l.yaml
    │   ├── yolov5m.yaml
    │   ├── yolov5n.yaml
    │   ├── yolov5s.yaml
    │   └── yolov5x.yaml
    ├── predict.py
    ├── train.py
    ├── utils
    │   ├── __init__.py
    │   ├── activations.py
    │   ├── augmentations.py
    │   ├── autoanchor.py
    │   ├── autobatch.py
    │   ├── aws
    │   │   ├── __init__.py
    │   │   ├── mime.sh
    │   │   ├── resume.py
    │   │   └── userdata.sh
    │   ├── benchmarks.py
    │   ├── callbacks.py
    │   ├── dataloaders.py
    │   ├── docker
    │   │   ├── .dockerignore
    │   │   ├── Dockerfile
    │   │   ├── Dockerfile-arm64
    │   │   └── Dockerfile-cpu
    │   ├── downloads.py
    │   ├── flask_rest_api
    │   │   ├── README.md
    │   │   ├── example_request.py
    │   │   └── restapi.py
    │   ├── general.py
    │   ├── google_app_engine
    │   │   ├── Dockerfile
    │   │   ├── additional_requirements.txt
    │   │   └── app.yaml
    │   ├── loggers
    │   │   ├── __init__.py
    │   │   └── wandb
    │   │       ├── README.md
    │   │       ├── __init__.py
    │   │       ├── log_dataset.py
    │   │       ├── sweep.py
    │   │       ├── sweep.yaml
    │   │       └── wandb_utils.py
    │   ├── loss.py
    │   ├── metrics.py
    │   ├── plots.py
    │   └── torch_utils.py
    └── val.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/Event_extraction/conf/EE1.0/event_schema.json:
--------------------------------------------------------------------------------
1 | {"role_list": [{"role": "攻击事件发起者"}, {"role": "攻击事件承受者"}, {"role": "攻击事件使用器械"}, {"role": "攻击事件时间"}, {"role": "攻击事件地点"}], "event_type": "攻击事件", "id": "66fd6541f6e81ecbc9df69ac377d8d8f"}
2 | {"role_list": [{"role": "侦查事件发起者"}, {"role": "侦查事件承受者"}, {"role": "侦查事件使用器械"}, {"role": "侦查事件时间"}, {"role": "侦查事件地点"}], "event_type": "侦查事件", "id": "ce7923d3e82610c3f1692f98193426d6"}
3 | {"role_list": [{"role": "保障事件发起者"}, {"role": "保障事件承受者"}, {"role": "保障事件使用器械"}, {"role": "保障事件时间"}, {"role": "保障事件地点"}], "event_type": "保障事件", "id": "42774ed6cc845ce2bbcbce242deb8994"}
4 | {"role_list": [{"role": "封锁事件发起者"}, {"role": "封锁事件使用器械"}, {"role": "封锁事件时间"}, {"role": "封锁事件地点"}], "event_type": "封锁事件", "id": "83c21d96a46a78c2e3620fd75668675d"}
5 | {"role_list": [{"role": "部署事件发起者"}, {"role": "部署事件承受者"}, {"role": "部署事件使用器械"}, {"role": "部署事件时间"}, {"role": "部署事件地点"}], "event_type": "部署事件", "id": "90faa2088d198f670edbb6dc65766877"}
6 | {"role_list": [{"role": "防守事件发起者"}, {"role": "防守事件承受者"}, {"role": "防守事件使用器械"}, {"role": "防守事件时间"}, {"role": "防守事件地点"}], "event_type": "防守事件", "id": "2f2acf626f0e06df3f0491ff37db2b3e"}
7 | {"role_list": [{"role": "机动事件发起者"}, {"role": "机动事件使用器械"}, {"role": "机动事件时间"}, {"role": "机动事件地点"}], "event_type": "机动事件", "id": "2fbbd5f1cebf47c246249880d283bcbe"}
8 |
--------------------------------------------------------------------------------
/Event_extraction/conf/EE1.0/role_tag.dict:
--------------------------------------------------------------------------------
1 | 0 B-攻击事件发起者
2 | 1 I-攻击事件发起者
3 | 2 B-攻击事件承受者
4 | 3 I-攻击事件承受者
5 | 4 B-攻击事件使用器械
6 | 5 I-攻击事件使用器械
7 | 6 B-攻击事件时间
8 | 7 I-攻击事件时间
9 | 8 B-攻击事件地点
10 | 9 I-攻击事件地点
11 | 10 B-侦查事件发起者
12 | 11 I-侦查事件发起者
13 | 12 B-侦查事件承受者
14 | 13 I-侦查事件承受者
15 | 14 B-侦查事件使用器械
16 | 15 I-侦查事件使用器械
17 | 16 B-侦查事件时间
18 | 17 I-侦查事件时间
19 | 18 B-侦查事件地点
20 | 19 I-侦查事件地点
21 | 20 B-保障事件发起者
22 | 21 I-保障事件发起者
23 | 22 B-保障事件承受者
24 | 23 I-保障事件承受者
25 | 24 B-保障事件使用器械
26 | 25 I-保障事件使用器械
27 | 26 B-保障事件时间
28 | 27 I-保障事件时间
29 | 28 B-保障事件地点
30 | 29 I-保障事件地点
31 | 30 B-封锁事件发起者
32 | 31 I-封锁事件发起者
33 | 32 B-封锁事件使用器械
34 | 33 I-封锁事件使用器械
35 | 34 B-封锁事件时间
36 | 35 I-封锁事件时间
37 | 36 B-封锁事件地点
38 | 37 I-封锁事件地点
39 | 38 B-部署事件发起者
40 | 39 I-部署事件发起者
41 | 40 B-部署事件承受者
42 | 41 I-部署事件承受者
43 | 42 B-部署事件使用器械
44 | 43 I-部署事件使用器械
45 | 44 B-部署事件时间
46 | 45 I-部署事件时间
47 | 46 B-部署事件地点
48 | 47 I-部署事件地点
49 | 48 B-防守事件发起者
50 | 49 I-防守事件发起者
51 | 50 B-防守事件承受者
52 | 51 I-防守事件承受者
53 | 52 B-防守事件使用器械
54 | 53 I-防守事件使用器械
55 | 54 B-防守事件时间
56 | 55 I-防守事件时间
57 | 56 B-防守事件地点
58 | 57 I-防守事件地点
59 | 58 B-机动事件发起者
60 | 59 I-机动事件发起者
61 | 60 B-机动事件使用器械
62 | 61 I-机动事件使用器械
63 | 62 B-机动事件时间
64 | 63 I-机动事件时间
65 | 64 B-机动事件地点
66 | 65 I-机动事件地点
67 | 66 O
68 |
--------------------------------------------------------------------------------
/Event_extraction/conf/EE1.0/trigger_tag.dict:
--------------------------------------------------------------------------------
1 | 0 B-攻击事件
2 | 1 I-攻击事件
3 | 2 B-侦查事件
4 | 3 I-侦查事件
5 | 4 B-保障事件
6 | 5 I-保障事件
7 | 6 B-封锁事件
8 | 7 I-封锁事件
9 | 8 B-部署事件
10 | 9 I-部署事件
11 | 10 B-防守事件
12 | 11 I-防守事件
13 | 12 B-机动事件
14 | 13 I-机动事件
15 | 14 O
16 |
--------------------------------------------------------------------------------
/Event_extraction/duee_1_data_prepare.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | from utils import read_by_lines, write_by_lines,text_to_sents,cal_md5
5 |
6 | enum_role = '填充'
7 |
8 | def data_process(path, model="trigger", is_predict=False):
9 | """data_process"""
10 |
11 | def label_data(data, start, l, _type):
12 | """label_data"""
13 | for i in range(start, start + l):
14 | suffix = "B-" if i == start else "I-"
15 | data[i] = "{}{}".format(suffix, _type)
16 | return data
17 |
18 | sentences = []
19 | output = ["text_a"] if is_predict else ["text_a\tlabel"]
20 | with open(path) as f:
21 | for line in f:
22 | d_json = json.loads(line.strip())
23 | _id = d_json["id"]
24 | text_a = [
25 | "," if t == " " or t == "\n" or t == "\r" else t
26 | for t in list(d_json["text"].lower())
27 | ]
28 | if is_predict:
29 | sentences.append({"text": d_json["text"], "id": _id})
30 | output.append('\002'.join(text_a))
31 | else:
32 | if model == "trigger":
33 |                 # skip samples with an empty event list
34 | if len(d_json.get("event_list", [])) > 0:
35 | labels = ["O"] * len(text_a)
36 | for event in d_json.get("event_list", []):
37 |
38 | event_type = event["event_type"]
39 | trigger = event["trigger"]
40 | try:
41 | start = d_json["text"].find(trigger)
42 | labels = label_data(labels, start, len(trigger),
43 | event_type)
44 | except:
45 | pass
46 | output.append("{}\t{}".format('\002'.join(text_a),
47 | '\002'.join(labels)))
48 | elif model == "role":
49 | if len(d_json.get("event_list", []))>0:
50 | for event in d_json.get("event_list", []):
51 | labels = ["O"] * len(text_a)
52 | event_type = event["event_type"]
53 | trigger = event["trigger"]
54 |                     # prepend event type + trigger to the text as prior information
55 | txt_event = event_type+trigger+':'
56 | for arg in event["arguments"]:
57 | role_type = arg["role"]
58 | argument = arg["argument"]
59 | try:
60 | start = arg["argument_start_index"]
61 | labels = label_data(labels, start, len(argument),
62 | role_type)
63 | except:
64 | pass
65 | txt = [
66 | "," if t == " " or t == "\n" or t == "\r" else t
67 | for t in list(txt_event.lower())
68 | ]
69 | txt_event_label = ["O"] * len(txt_event)+labels
70 |
71 | output.append("{}\t{}".format('\002'.join(txt + text_a),
72 | '\002'.join(txt_event_label)))
73 | # else:
74 | # labels = ["O"] * len(text_a)
75 | # output.append("{}\t{}".format('\002'.join(text_a),
76 | # '\002'.join(labels)))
77 |
78 | return output
79 |
80 |
81 | def schema_process(path, model="trigger"):
82 | """schema_process"""
83 |
84 | def label_add(labels, _type):
85 | """label_add"""
86 | if "B-{}".format(_type) not in labels:
87 | labels.extend(["B-{}".format(_type), "I-{}".format(_type)])
88 | return labels
89 |
90 | labels = []
91 | for line in read_by_lines(path):
92 | d_json = json.loads(line.strip())
93 | if model == "trigger":
94 | labels = label_add(labels, d_json["event_type"])
95 | elif model == "role":
96 | for role in d_json["role_list"]:
97 | labels = label_add(labels, role["role"])
98 | labels.append("O")
99 | tags = []
100 | for index, label in enumerate(labels):
101 | tags.append("{}\t{}".format(index, label))
102 | return tags
103 |
104 | # if __name__ == "__main__":
105 | # train_sent = docs_data_process(
106 | # "./data/EE1.0/duee_train.json")
107 | # print(train_sent)
108 |
109 |
110 | if __name__ == "__main__":
111 | print("\n=================CCKS 1.0 DATASET==============")
112 | conf_dir = "../data/EE1.0"
113 | # conf_dir = "./conf/Event_extraction-fin-1"
114 | schema_path = "{}/event_schema.json".format(conf_dir)
115 | tags_trigger_path = "{}/trigger_tag.dict".format(conf_dir)
116 | tags_role_path = "{}/role_tag.dict".format(conf_dir)
117 | print("\n=================start schema process==============")
118 | print('input path {}'.format(schema_path))
119 | tags_trigger = schema_process(schema_path, "trigger")
120 | write_by_lines(tags_trigger_path, tags_trigger)
121 | print("save trigger tag {} at {}".format(len(tags_trigger),
122 | tags_trigger_path))
123 | tags_role = schema_process(schema_path, "role")
124 | write_by_lines(tags_role_path, tags_role)
125 | print("save role tag {} at {}".format(len(tags_role), tags_role_path))
126 | print("=================end schema process===============")
127 |
128 | # data process
129 | data_dir = "../data/EE1.0"
130 | # data_dir = "./data/Event_extraction-fin-1"
131 | trigger_save_dir = "{}/trigger".format(data_dir)
132 | role_save_dir = "{}/role".format(data_dir)
133 | print("\n=================start schema process==============")
134 | if not os.path.exists(trigger_save_dir):
135 | os.makedirs(trigger_save_dir)
136 | if not os.path.exists(role_save_dir):
137 | os.makedirs(role_save_dir)
138 | print("\n----trigger------for dir {} to {}".format(data_dir,
139 | trigger_save_dir))
140 | train_tri = data_process("{}/train.json".format(data_dir), "trigger")
141 | write_by_lines("{}/train.tsv".format(trigger_save_dir), train_tri)
142 | dev_tri = data_process("{}/dev.json".format(data_dir), "trigger")
143 | write_by_lines("{}/dev.tsv".format(trigger_save_dir), dev_tri)
144 | test_tri = data_process("{}/test.json".format(data_dir), "trigger")
145 | write_by_lines("{}/test.tsv".format(trigger_save_dir), test_tri)
146 | print("train {} dev {} test {}".format(len(train_tri), len(dev_tri),
147 | len(test_tri)))
148 | print("\n----role------for dir {} to {}".format(data_dir, role_save_dir))
149 | train_role = data_process("{}/train.json".format(data_dir), "role")
150 | write_by_lines("{}/train.tsv".format(role_save_dir), train_role)
151 | dev_role = data_process("{}/dev.json".format(data_dir), "role")
152 | write_by_lines("{}/dev.tsv".format(role_save_dir), dev_role)
153 | test_role = data_process("{}/test.json".format(data_dir), "role")
154 | write_by_lines("{}/test.tsv".format(role_save_dir), test_role)
155 | print("train {} dev {} test {}".format(len(train_role), len(dev_role),
156 | len(test_role)))
157 | print("=================end schema process==============")
--------------------------------------------------------------------------------
/Event_extraction/log/endpoints.log:
--------------------------------------------------------------------------------
1 | PADDLE_TRAINER_ENDPOINTS:
2 | 127.0.0.1:60833
--------------------------------------------------------------------------------
/Event_extraction/log/workerlog.0:
--------------------------------------------------------------------------------
1 | /home/zxwang/anaconda3/envs/cxy_ccks2022/bin/python: can't open file '0': [Errno 2] No such file or directory
2 | /home/zxwang/anaconda3/envs/cxy_ccks2022/bin/python: can't open file '0': [Errno 2] No such file or directory
3 | dev step: 600 - loss: 0.00887, precision: 0.80908, recall: 0.89202, f1: 0.84853 current best 0.84274
4 | ==============================================save best model best performerence 0.848530
5 | train epoch: 1 - step: 610 (total: 9060) - loss: 0.005241
6 |
--------------------------------------------------------------------------------
/Event_extraction/model.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2022/7/20 17:07
3 | # @Author: Jielong Tang
4 | # @File : BERT-gru-crf.py
5 | import paddle
6 | import paddle.nn as nn
7 | from paddlenlp.layers.crf import LinearChainCrf, LinearChainCrfLoss, ViterbiDecoder
8 | from paddlenlp.transformers import ErnieModel,BertModel
9 |
10 | class FGM():
11 |     """Adversarial training that applies a gradient-ascent perturbation to the embedding layer: Fast Gradient Method (FGM)."""
12 | 
13 |     def __init__(self, model):
14 |         self.model = model
15 |         self.backup = {}
16 | 
17 |     def attack(self, epsilon=1., emb_name='embedding'):
18 |         # emb_name: substring identifying the embedding parameters
19 |         for name, param in self.model.named_parameters():
20 |             if not param.stop_gradient and emb_name in name:  # only trainable embedding parameters
21 |                 # print('begin')
22 |                 self.backup[name] = param.numpy()  # back up the original parameter values
23 |                 grad_tensor = paddle.to_tensor(param.grad)  # param.grad is a numpy object
24 |                 norm = paddle.norm(grad_tensor)  # gradient norm
25 |                 if norm != 0:
26 |                     r_at = epsilon * grad_tensor / norm
27 |                     param.add(r_at)  # add the gradient-direction perturbation to the embeddings
28 | 
29 |     def restore(self, emb_name='embedding'):
30 |         for name, param in self.model.named_parameters():
31 |             if not param.stop_gradient and emb_name in name:
32 |                 assert name in self.backup
33 |                 param.set_value(self.backup[name])  # restore the original embedding values
34 |         self.backup = {}
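    # Usage sketch (illustrative only; the actual training loop lives elsewhere in this repo):
    #   fgm = FGM(model)
    #   loss = model(...); loss.backward()           # gradients on the clean input
    #   fgm.attack()                                 # perturb the embeddings
    #   loss_adv = model(...); loss_adv.backward()   # accumulate adversarial gradients
    #   fgm.restore()                                # undo the perturbation before optimizer.step()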
35 |
36 | class ptm_GRUCRF(nn.Layer):
37 | def __init__(self,pretrain_model,num_class,gru_hidden_size=300,
38 | crf_lr=100):
39 | super().__init__()
40 | self.num_classes = num_class
41 | self.bert = BertModel.from_pretrained(pretrain_model)
42 | self.gru = nn.GRU(self.bert.config["hidden_size"],
43 | gru_hidden_size,
44 | num_layers = 2,
45 | direction='bidirect')
46 | self.fc = nn.Linear(gru_hidden_size*2,num_class)
47 | self.crf = LinearChainCrf(self.num_classes)
48 | self.crf_loss = LinearChainCrfLoss(self.crf)
49 | self.viterbi_decoder = ViterbiDecoder(self.crf.transitions)
50 |
51 | def forward(self,input_ids,token_type_ids,lengths=None,labels=None):
52 | encoder_output,_ = self.bert(input_ids, token_type_ids = token_type_ids,output_hidden_states = False)
53 | # all_hidden_states = paddle.stack(encoder_output,axis=)
54 | # concatenate_pooling = paddle.concat(
55 | # (encoder_output[-1], encoder_output[-2], encoder_output[-3], encoder_output[-4]), -1
56 | # )
57 | # print(concatenate_pooling)
58 | # concatenate_pooling = concatenate_pooling[:, 0]
59 | gru_output, _ = self.gru(encoder_output)
60 | emission = self.fc(gru_output)
61 | if labels is not None:
62 | loss = self.crf_loss(emission, lengths, labels)
63 | return loss
64 | else:
65 | _,prediction = self.viterbi_decoder(emission, lengths)
66 | return prediction
67 |
68 | if __name__ =='__main__':
69 | print('01')
--------------------------------------------------------------------------------
/Event_extraction/run_role_labeling.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | data_dir=$1
3 | conf_path=$2
4 | ckpt_dir=$3
5 | predict_data=$4
6 | learning_rate=$5
7 | is_train=$6
8 | max_seq_len=$7
9 | batch_size=$8
10 | epoch=${9}
11 | pred_save_path=${10}
12 |
13 | if [ "$is_train" = True ]; then
14 | unset CUDA_VISIBLE_DEVICES
15 | # python -m paddle.distributed.launch --gpus "0" sequence_labeling.py
16 | export CUDA_VISIBLE_DEVICES=0
17 | python -m paddle.distributed.launch --gpus "0" sequence_labeling_role.py \
18 | --num_epoch ${epoch} \
19 | --learning_rate ${learning_rate} \
20 | --tag_path ${conf_path} \
21 | --train_data ${data_dir}/train.tsv \
22 | --dev_data ${data_dir}/dev.tsv \
23 | --test_data ${data_dir}/test.tsv \
24 | --predict_data ${predict_data} \
25 | --do_train True \
26 | --do_predict False \
27 | --max_seq_len ${max_seq_len} \
28 | --batch_size ${batch_size} \
29 | --skip_step 10 \
30 | --valid_step 50 \
31 | --checkpoints ${ckpt_dir} \
32 | --init_ckpt ${ckpt_dir}/best.pdparams \
33 | --predict_save_path ${pred_save_path} \
34 | --device gpu
35 | else
36 | export CUDA_VISIBLE_DEVICES=0
37 | python sequence_labeling_role.py \
38 | --num_epoch ${epoch} \
39 | --learning_rate ${learning_rate} \
40 | --tag_path ${conf_path} \
41 | --train_data ${data_dir}/train.tsv \
42 | --dev_data ${data_dir}/dev.tsv \
43 | --test_data ${data_dir}/test.tsv \
44 | --predict_data ${predict_data} \
45 | --do_train False \
46 | --do_predict True \
47 | --max_seq_len ${max_seq_len} \
48 | --batch_size ${batch_size} \
49 | --skip_step 10 \
50 | --valid_step 50 \
51 | --checkpoints ${ckpt_dir} \
52 | --init_ckpt ${ckpt_dir}/best.pdparams \
53 | --predict_save_path ${pred_save_path} \
54 | --device gpu
55 | fi
56 |
--------------------------------------------------------------------------------
/Event_extraction/run_trigger_labeling.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | data_dir=$1
3 | conf_path=$2
4 | ckpt_dir=$3
5 | predict_data=$4
6 | learning_rate=$5
7 | is_train=$6
8 | max_seq_len=$7
9 | batch_size=$8
10 | epoch=${9}
11 | pred_save_path=${10}
12 |
13 | if [ "$is_train" = True ]; then
14 | unset CUDA_VISIBLE_DEVICES
15 | # python -m paddle.distributed.launch --gpus "0" sequence_labeling.py
16 | export CUDA_VISIBLE_DEVICES=0
17 | python -m paddle.distributed.launch --gpus "0" sequence_labeling_trigger.py \
18 | --num_epoch ${epoch} \
19 | --learning_rate ${learning_rate} \
20 | --tag_path ${conf_path} \
21 | --train_data ${data_dir}/train.tsv \
22 | --dev_data ${data_dir}/dev.tsv \
23 | --test_data ${data_dir}/test.tsv \
24 | --predict_data ${predict_data} \
25 | --do_train True \
26 | --do_predict False \
27 | --max_seq_len ${max_seq_len} \
28 | --batch_size ${batch_size} \
29 | --skip_step 10 \
30 | --valid_step 50 \
31 | --checkpoints ${ckpt_dir} \
32 | --init_ckpt ${ckpt_dir}/best.pdparams \
33 | --predict_save_path ${pred_save_path} \
34 | --device gpu
35 | else
36 | export CUDA_VISIBLE_DEVICES=0
37 | python sequence_labeling_trigger.py \
38 | --num_epoch ${epoch} \
39 | --learning_rate ${learning_rate} \
40 | --tag_path ${conf_path} \
41 | --train_data ${data_dir}/train.tsv \
42 | --dev_data ${data_dir}/dev.tsv \
43 | --test_data ${data_dir}/test.tsv \
44 | --predict_data ${predict_data} \
45 | --do_train False \
46 | --do_predict True \
47 | --max_seq_len ${max_seq_len} \
48 | --batch_size ${batch_size} \
49 | --skip_step 10 \
50 | --valid_step 50 \
51 | --checkpoints ${ckpt_dir} \
52 | --init_ckpt ${ckpt_dir}/best.pdparams \
53 | --predict_save_path ${pred_save_path} \
54 | --device gpu
55 | fi
56 |
--------------------------------------------------------------------------------
/Event_extraction/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2022/7/14 10:25
3 | # @Author: Jielong Tang
4 | # @File : test.py
5 | import paddle
6 | from toolHandlerJLT import Handler
7 |
8 | def testGPU():
9 | env_info = {}
10 | compiled_with_cuda = paddle.is_compiled_with_cuda()
11 | env_info['Paddle compiled with cuda'] = compiled_with_cuda
12 | print(paddle.get_device())
13 |
14 | if compiled_with_cuda:
15 | v = paddle.get_cudnn_version()
16 | v = str(v // 1000) + '.' + str(v % 1000 // 100)
17 | env_info['cudnn'] = v
18 | if 'gpu' in paddle.get_device():
19 | gpu_nums = paddle.distributed.ParallelEnv().nranks
20 | else:
21 | gpu_nums = 0
22 | env_info['GPUs used'] = gpu_nums
23 |
24 | env_info['PaddlePaddle'] = paddle.__version__
25 |
26 | for k, v in env_info.items():
27 | print('{}: {}'.format(k, v))
28 |
29 | if __name__ == '__main__':
30 | print(Handler.sort_([]))
31 | # testGPU()
32 | # paddle.fluid.install_check.run_check()
33 | # paddle.fluid.is_compiled_with_cuda()
34 |
--------------------------------------------------------------------------------
/Event_extraction/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2022/7/13 12:17
3 | # @Author: Jielong Tang
4 | # @File : utils.py
5 |
6 |
7 | import hashlib
8 | import json
9 |
10 | def cal_md5(str):
11 | """calculate string md5"""
12 | str = str.decode("utf-8", "ignore").encode("utf-8", "ignore")
13 | return hashlib.md5(str).hexdigest()
14 |
15 |
16 | def read_by_lines(path):
17 | """read the data by line"""
18 | result = list()
19 | with open(path, "r", encoding="utf8") as infile:
20 | for line in infile:
21 | result.append(line.strip())
22 | return result
23 |
24 |
25 | def write_by_lines(path, data):
26 | """write the data"""
27 | with open(path, "w", encoding="utf8") as outfile:
28 | [outfile.write(d + "\n") for d in data]
29 |
30 |
31 | def text_to_sents(text):
32 | """text_to_sents"""
33 |     delimiter_symbols = [u"。", u"?", u"!"]
34 | paragraphs = text.split("\n")
35 | ret = []
36 | for para in paragraphs:
37 | if para == u"":
38 | continue
39 | sents = [u""]
40 | for s in para:
41 | sents[-1] += s
42 |             if s in delimiter_symbols:
43 | sents.append(u"")
44 | if sents[-1] == u"":
45 | sents = sents[:-1]
46 | ret.extend(sents)
47 | return ret
48 |
49 |
50 | def load_dict(dict_path):
51 | """load_dict"""
52 | vocab = {}
53 | for line in open(dict_path, 'r', encoding='utf-8'):
54 | value, key = line.strip('\n').split('\t')
55 | vocab[key] = int(value)
56 | return vocab
57 |
58 |
59 | def extract_result(text, labels):
60 | """extract_result"""
61 | ret, is_start, cur_type = [], False, None
62 | if len(text) != len(labels):
63 |         # Korean characters can make the label sequence longer than the text
64 | labels = labels[:len(text)]
65 | for i, label in enumerate(labels):
66 | if label != u"O":
67 | _type = label[2:]
68 | if label.startswith(u"B-"):
69 | is_start = True
70 | cur_type = _type
71 | ret.append({"start": i, "text": [text[i]], "type": _type})
72 | elif _type != cur_type:
73 | """
74 | # 如果是没有B-开头的,则不要这部分数据
75 | cur_type = None
76 | is_start = False
77 | """
78 | cur_type = _type
79 | is_start = True
80 | ret.append({"start": i, "text": [text[i]], "type": _type})
81 | elif is_start:
82 | ret[-1]["text"].append(text[i])
83 | else:
84 | cur_type = None
85 | is_start = False
86 | else:
87 | cur_type = None
88 | is_start = False
89 | return ret
90 |
91 | def read_json(data_path):
92 | with open(data_path, 'r', encoding='utf-8') as f:
93 | data = json.load(f)
94 | return data
95 |
96 | def val_generate(data_path,pre_result):
97 |     # filter the predicted results using the multi-label classification output
98 | with open(data_path, 'r', encoding='utf-8') as f:
99 | data = json.load(f)
100 | sentences = data # origin data format
101 | result = []
102 | set_list = []
103 | # for sent in sentences:
104 | # for clas in sent['class']:
105 | # if clas != '无事件':
106 | # item = {}
107 | # item['id'] = sent['id']
108 | # item['text'] = clas+':'+ sent['text'].rstrip()
109 | # set_list.append(item)
110 | for sent in sentences:
111 | for clas in sent['class']:
112 | if clas != '无事件':
113 | set_list.append((sent['id'],clas))
114 | print(set_list)
115 | for txt in pre_result:
116 | a = (txt['id'],txt['event_type'])
117 | if a in set_list:
118 | result.append(txt)
119 | return result
120 |
121 | def wgm_trans_decodes(ds, decodes, lens, label_vocab):
122 |     # flatten decodes and lens from per-batch lists into flat lists
123 |     decodes = [x for batch in decodes for x in batch]
124 |     lens = [x for batch in lens for x in batch]
125 |     # zip (id, label) pairs, then build the id -> label dict
126 |     id_label = dict(zip(label_vocab.values(), label_vocab.keys()))
127 |     # list holding the decoded result of every sentence
128 |     results = []
129 |     # initialize the running sentence index
130 |     inNum = 1
131 |     # process each tag-id sequence in turn
132 |     for idx, end in enumerate(lens):
133 |         # the characters of the sentence
134 |         sent_array = ds.data[idx][0][:end]
135 |         # the per-character tags
136 |         tags_array = [id_label[x] for x in decodes[idx][1:end]]
137 |         # initialize the sentence string and its tag string
138 |         sent = ""
139 |         tags = ""
140 |         # join the character array into a single string
141 |         for i in range(end-2):
142 |             #pdb.set_trace()
143 |             # concatenate characters to form the sentence
144 |             sent = sent + sent_array[i]
145 |             # join tags with spaces
146 |             if i > 0:
147 |                 tags = tags + " " + tags_array[i]
148 |             else:  # the first tag
149 |                 tags = tags_array[i]
150 |         # build the result line: index + sentence + tag sequence, joined by "\u0001"
151 |         current_pred = str(inNum) + '\u0001' + sent + '\u0001' + tags + "\n"
152 |         #pdb.set_trace()
153 |         # append to the list of decoded sentences
154 |         results.append(current_pred)
155 |         inNum = inNum + 1
156 |     return results
157 |
158 |
159 | if __name__ == "__main__":
160 | # s = "xxdedewd"
161 | # print(cal_md5(s.encode("utf-8")))
162 | print(load_dict('../data/EE1.0/trigger_tag.dict'))
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Jielong Tang, Zhenxing Wang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Event Argument Extraction from Open-Source Multi-modal Military Equipment Data
2 | 
3 | 
4 | Multi-modal military event argument extraction extracts the event information a user needs from multi-modal data in the military domain and presents it in structured form; it is a concrete application of event extraction, object detection, and text recognition to the military domain. CCKS 2022 organized this event argument extraction evaluation task: extract the relevant event types and event arguments from publicly available multi-modal military equipment data (text + images) on the internet, delivering multi-modal, structured, high-value extraction results about the target equipment. We combine textual event extraction, image object detection, and multi-modal knowledge fusion to perform the extraction, achieving a final test-set F1 of 0.53403 and ranking second in the evaluation.
5 | 
6 | This document shows how to use our approach and quickly reproduce our results for the CCKS task of event argument extraction from open-source multi-modal military equipment data.
7 |
8 | ## Directory structure
9 | 
10 | The main directories of this project and what they contain:
11 |
12 | ```python
13 | --CCKS2022_JS
14 | |--postprocess.py #post-processing
15 | |--run_ee.sh #main pipeline script
16 | |--evaluation_metric.py #evaluation metrics
17 | |--ckpt #model checkpoints
18 | |--submit #result files
19 | |--data #data files
20 | | |--ObjectDect #yolov5 model input
21 | | | |--train_images
22 | | | | |--labels
23 | | | | |--images
24 | | | |--dev_images
25 | | | | |--labels
26 | | | | |--images
27 | | |--result #intermediate result files
28 | | |--EE1.0 #event-extraction input
29 | | | |--test.json #test set
30 | | | |--dev.json #validation set after trigger annotation
31 | | | |--roles.xlsx #replacement entities for data augmentation
32 | | | |--trigger/ #trigger-extraction input
33 | | | |--role/ #argument-extraction input
34 | | | |--train.json #training set after trigger annotation
35 | | | |--event_schema.json #event schema definition
36 | | |--raw_data #raw dataset
37 | | | |--test
38 | | | |--train
39 | | | |--val
40 | | |--MutiClass #multi-label classification input
41 | | | |--processed_data #preprocessed model input
42 | | | | |--test.json
43 | | | | |--dev.json
44 | | | | |--labels.txt
45 | | | | |--train.json
46 | |--Event_extraction #event extraction
47 | | |--data_agumentation.py #data augmentation
48 | | |--sequence_labeling.py #sequence labeling
49 | | |--test.py
50 | | |--utils.py #utilities
51 | | |--duee_1_data_prepare.py #data preprocessing
52 | | |--model.py #model code
53 | | |--run_role_labeling.sh #argument-extraction script
54 | | |--sequence_labeling_role.py #argument extraction
55 | | |--run_trigger_labeling.sh #trigger-extraction script
56 | | |--sequence_labeling_trigger.py #trigger extraction
57 | |--bert_multi_classification #multi-label classification
58 | | |--dataset.py #dataset construction
59 | | |--model #model files
60 | | |--logs #output logs
61 | | |--predict_result.py #prediction
62 | | |--config.py #configuration
63 | | |--train.py #model training
64 | | |--convert_train_dev_format.py #bert data-format conversion
65 | | |--convert_test_format.py #bert data-format conversion
66 | | |--data_preprocess.py #data preprocessing
67 | | |--utils #utilities
68 | | |--models.py #model definition
69 | |--yolov5 #object detection
70 | | |--detect.py #object detection
71 | | |--data #data config files
72 | | | |--ccks2022.yaml
73 | | |--train.py #model training
74 | | |--predict.py #model inference
75 | | |--models
76 | | | |--common.py
77 | | | |--yolov5x.yaml
78 | | | |--yolov5l.yaml
79 | | | |--experimental.py
80 | | | |--yolo.py
81 | | |--utils #utilities
82 | |--role_classification #argument-entity discrimination
83 | | |--dataset.py #dataset construction
84 | | |--utils.py #utilities
85 | | |--data #argument-entity classes
86 | | | |--train.xlsx
87 | | |--train.py #model training
88 | | |--predict.py #model prediction
89 | 
90 | ```
91 | ## Multi-modal military equipment event argument extraction
92 | 
93 | ### Evaluation method
94 | The task is scored with the precision (P), recall (R), and F1 of event argument extraction. Argument-matching F1 is the final metric, and matching is case-insensitive. F1 is computed as:
95 | 
96 | $$ F1 = \frac{2 \times P \times R}{P + R} $$
97 | 
98 | where
99 | 
100 | - P = correctly predicted event arguments / all predicted event arguments.
101 | - R = correctly predicted event arguments / all gold-annotated event arguments.
102 | - A predicted event argument counts as correct if its event type, position in the text, argument role, and entity position in the image all match one argument in the gold annotation; image positions match when the IoU between the predicted and gold boxes exceeds 0.5, and if an argument has no corresponding image entity, outputting -1 is judged correct (see the sketch below).
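The matching rule can be sketched in a few lines of Python (our illustration only; the official scorer is evaluation_metric.py in the repository root, and the (x1, y1, x2, y2) box format below is an assumption):

```python
def f1_score(n_correct, n_pred, n_gt):
    # P = correct / predicted, R = correct / gold, F1 = 2PR / (P + R)
    p = n_correct / n_pred if n_pred else 0.0
    r = n_correct / n_gt if n_gt else 0.0
    return 2 * p * r / (p + r) if p + r else 0.0

def bbox_match(pre, gt):
    # an image entity is correct when IoU > 0.5; -1 means "no image entity"
    # and is correct only if the gold annotation is also -1
    if pre == -1 or gt == -1:
        return pre == gt
    (x1, y1, x2, y2), (a1, b1, a2, b2) = pre, gt
    iw = max(0, min(x2, a2) - max(x1, a1))
    ih = max(0, min(y2, b2) - max(y1, b1))
    inter = iw * ih
    union = (x2 - x1) * (y2 - y1) + (a2 - a1) * (b2 - b1) - inter
    return inter / union > 0.5 if union else False
```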
103 |
104 | ### Model and data download (place the files exactly at the paths below)
105 | Baidu Netdisk link, extraction code: 8b8g
106 | 
107 | - **Extract data.zip from the archive and place all its files under CCKS2022_JS/data**
108 | - **Extract ckpt.zip from the archive and place all its files under CCKS2022_JS/ckpt**
109 | 
110 | - **CCKS2022_JS/data/raw_data/train holds the official training data**
111 | - **CCKS2022_JS/data/raw_data/test holds the official test data**
112 | - **CCKS2022_JS/data/raw_data/dev holds the official validation data**
113 | - **CCKS2022_JS/data/result holds the intermediate results of each step**
114 | ### Main runtime dependencies (installation in a conda virtual environment is recommended)
115 | System:
116 | ```shell
117 | Linux:
118 | Ubuntu 18.04.6 LTS
119 | GPU:
120 | NVIDIA GeForce RTX 3090
121 | ```
122 |
123 | ---
124 |
125 | python:
126 |
127 | ```shell
128 | python3.8
129 | ```
130 |
131 | Dependencies:
132 | ```shell
133 | pip install -r requirements.txt
134 | ```
135 | ### Reproducing the solution
136 | The steps below walk through reproducing our solution. Since it is implemented as a pipeline, the overall flow splits into six stages:
137 | - 1. Data preprocessing
138 | - 2. Multi-label classification model
139 | - 3. Trigger model
140 | - 4. Argument extraction model
141 | - 5. Object detection model
142 | - 6. Result post-processing
143 |
144 | ### Quick start:
145 | The command below uses the pre-trained checkpoints to generate the final task results in one pass; see the CCKS2022_JS/run_ee.sh script for the individual sub-tasks. Alternatively, run the training and prediction stages step by step as described below. The final result file is written to **CCKS2022_JS/submit/result_xxx.txt**, where "xxx" is a timestamp.
146 | ``` shell
147 | # run the full pipeline in one shot
148 | sh run_ee.sh pipeline
149 | ```
150 | ### Step-by-step execution:
151 | The following generates the target results step by step.
152 | #### Step 1: Data preprocessing and loading
153 | 
154 | Download the dataset from the competition website, unpack it layer by layer into data/raw_data, then run the script below to convert the raw data into sequence-labeling format and apply data augmentation.
155 | The processed data is placed under data/EE1.0: train.json is the training set (1400 samples), dev.json the validation set (200 samples), and test.json the test set (400 samples); the entities randomly substituted during augmentation are stored in roles.xlsx. Trigger-recognition data files go under data/EE1.0/trigger and argument-role data files under data/EE1.0/role.
156 |
157 | ``` shell
158 | # data preprocessing
159 | sh run_ee.sh data_prepare
160 |
161 | # data augmentation
162 | sh run_ee.sh data_augmentation
163 | ```
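The generated .tsv files contain one sample per line: the characters of the text joined by the `\002` control character, a tab, and the `\002`-joined BIO labels. A minimal sketch of the labeling step, mirroring `label_data` in Event_extraction/duee_1_data_prepare.py (the example sentence is made up for illustration):

```python
def label_data(labels, start, length, _type):
    # tag the span [start, start + length) with B-/I- labels of the given type
    for i in range(start, start + length):
        labels[i] = ("B-" if i == start else "I-") + _type
    return labels

text = list("敌军于清晨对港口发动攻击")  # illustrative sentence, not from the dataset
labels = ["O"] * len(text)
trigger = "攻击"
labels = label_data(labels, "".join(text).find(trigger), len(trigger), "攻击事件")
print("\002".join(text) + "\t" + "\002".join(labels))
```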
164 |
165 |
166 | #### Step 2: Multi-label classification
167 | The multi-label classification code lives in CCKS2022_JS/bert_multi_classification. data_preprocess.py converts the data into the format BERT needs; dataset.py wraps the processed data in a torch Dataset; models.py defines the multi-label classification model; finally, train.py runs training and validation, and predict_result.py runs prediction.
168 | ```shell
169 | # train the multi-label classification model
170 | sh run_ee.sh multi_label_train
171 |
172 | # predict with the multi-label classification model
173 | sh run_ee.sh multi_label_predict
174 | ```
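At inference time the model emits one logit per event type; predict_result.py applies a sigmoid and keeps every class above a 0.5 threshold, falling back to 无事件 when nothing fires. A condensed sketch of that decoding step:

```python
import numpy as np
import torch

def decode_multilabel(logits, id2label, threshold=0.5):
    # logits: 1-D tensor of per-class scores for a single text
    probs = torch.sigmoid(logits).cpu().numpy()
    hits = np.where(probs > threshold)[0].tolist()
    return [id2label[i] for i in hits] if hits else ["无事件"]
```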
175 | #### Step 3: Trigger recognition
176 | 
177 | Trigger recognition trains on a partially hand-annotated set of training-set triggers and uses the model to predict triggers for the test set; the data files live in CCKS2022_JS/data/EE1.0. The annotated training set goes in ./trigger/train.tsv and the validation set in ./trigger/dev.tsv. To train on different data, annotate triggers in the format of ./train.json and re-run the Step 1 script, which regenerates the dataset format the project expects. Run the script below for training and prediction; intermediate results are written to CCKS2022_JS/data/result/trigger/test_predict_trigger.json.
178 |
179 |
180 | ```shell
181 | # train the trigger recognition model
182 | sh run_ee.sh trigger_train
183 |
184 | # predict with the trigger recognition model
185 | sh run_ee.sh trigger_predict
186 | ```
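The predicted B-/I-/O tag sequence is decoded back into trigger spans by `extract_result` in Event_extraction/utils.py; in simplified form (the repo version additionally starts a new span when an I- tag switches type):

```python
def extract_spans(text, labels):
    # collect {start, text, type} spans from a B-/I-/O tag sequence
    spans, cur = [], None
    for i, label in enumerate(labels[:len(text)]):
        if label.startswith("B-"):
            cur = {"start": i, "text": [text[i]], "type": label[2:]}
            spans.append(cur)
        elif label.startswith("I-") and cur is not None:
            cur["text"].append(text[i])
        else:
            cur = None
    return spans
```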
187 |
188 | #### Step 4: Argument extraction
189 | 
190 | Argument extraction builds the model input from the voted results of the Step 2 and Step 3 models. The training set goes in CCKS2022_JS/data/EE1.0/role/train.tsv, the validation set in CCKS2022_JS/data/EE1.0/role/dev.tsv, and intermediate results are written to CCKS2022_JS/data/result/role/test_predict_role.json.
191 |
192 | ```shell
193 | # train the argument extraction model
194 | sh run_ee.sh role_train
195 |
196 | # predict with the argument extraction model
197 | sh run_ee.sh role_predict
198 | ```
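The argument model sees the predicted event type and trigger as prior information: duee_1_data_prepare.py prepends "event_type + trigger + ':'" to the sentence and labels that prefix O during training. Schematically (the sample sentence is illustrative):

```python
def build_role_input(event_type, trigger, text):
    # prior-information prefix; it is always labeled "O" during training
    prefix = event_type + trigger + ":"
    return list(prefix.lower()) + list(text.lower())

chars = build_role_input("攻击事件", "攻击", "敌军于清晨对港口发动攻击")
print("".join(chars))
```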
199 |
200 | #### Step 5: Argument object detection
201 | Argument object detection uses a yolov5 model; see CCKS2022_JS/yolov5 for the code.
202 | 
203 | ##### Data
204 |
205 | ```
206 | - Set the dataset config file: ./CCKS2022_JS/yolov5/data/ccks2022.yaml
207 | 
208 | - path: # dataset root directory
209 | 
210 | - train: # where the training data lives
211 | 
212 | - val: # where the validation data lives
213 | ```
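For reference, a minimal ccks2022.yaml might look like the sketch below. The field names follow the standard YOLOv5 dataset config, but the paths, class count, and class names are placeholders to adapt to your local layout:

```yaml
path: ../data/ObjectDect        # dataset root directory (placeholder)
train: train_images/images      # training images, relative to path (placeholder)
val: dev_images/images          # validation images, relative to path (placeholder)

nc: 1                           # number of classes (placeholder)
names: ['equipment']            # class names (placeholder)
```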
214 |
215 |
216 | ##### Training
217 | Run:
218 | ```shell
219 | # train the yolov5 model
220 | sh run_ee.sh objDec_train
221 | ```
222 | The following prompt will appear:
223 | ``` shell
224 | ROOTPATH/yolov5
225 | wandb: (1) Create a W&B account
226 | wandb: (2) Use an existing W&B account
227 | wandb: (3) Don't visualize my results
228 | wandb: Enter your choice: (30 second timeout)
229 | ```
230 | Type 3 and press Enter to continue.
231 |
232 | ##### Prediction
233 |
234 | ---
235 |
236 | ```shell
237 | # predict with the yolov5 model
238 | sh run_ee.sh objDec_predict
239 | ```
240 |
241 |
242 | #### Step 6: Post-processing and submission
243 | 
244 | Merge all result files from Steps 1-5 and build the result file in the submission format the competition specifies.
245 |
246 | ```shell
247 | # post-processing
248 | sh run_ee.sh pred_2_submit
249 | ```
250 |
251 | The final result file is written to **CCKS2022_JS/submit/result_xxx.txt**, where "xxx" is a timestamp.
252 |
253 |
--------------------------------------------------------------------------------
/bert_multi_classification/config.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
4 | class Args:
5 | @staticmethod
6 | def parse():
7 | parser = argparse.ArgumentParser()
8 | return parser
9 |
10 | @staticmethod
11 | def initialize(parser):
12 | # args for path
13 | parser.add_argument('--output_dir', default='../ckpt/MultiClass/',
14 | help='the output dir for model checkpoints')
15 |
16 | parser.add_argument('--bert_dir', default='./model/bert-base-chinese/',
17 | help='bert dir for uer')
18 | parser.add_argument('--data_dir', default='./data/tcner/',
19 | help='data dir for uer')
20 | parser.add_argument('--log_dir', default='./logs/',
21 | help='log dir for uer')
22 |
23 | # other args
24 | parser.add_argument('--num_tags', default=8, type=int,
25 | help='number of tags')
26 | parser.add_argument('--seed', type=int, default=123, help='random seed')
27 |
28 | parser.add_argument('--gpu_ids', type=str, default='0',
29 | help='gpu ids to use, -1 for cpu, "0,1" for multi gpu')
30 |
31 | parser.add_argument('--max_seq_len', default=300, type=int)
32 |
33 | parser.add_argument('--eval_batch_size', default=12, type=int)
34 |
35 | parser.add_argument('--swa_start', default=3, type=int,
36 | help='the epoch when swa start')
37 |
38 | # train args
39 | parser.add_argument('--train_epochs', default=15, type=int,
40 | help='Max training epoch')
41 |
42 | parser.add_argument('--dropout_prob', default=0, type=float,
43 | help='drop out probability')
44 |
45 | # 2e-5
46 | parser.add_argument('--lr', default=2e-5, type=float,
47 | help='learning rate for the bert module')
48 | # 2e-3
49 | parser.add_argument('--other_lr', default=3e-4, type=float,
50 | help='learning rate for the module except bert')
51 | # 0.5
52 | parser.add_argument('--max_grad_norm', default=1, type=float,
53 | help='max grad clip')
54 |
55 | parser.add_argument('--warmup_proportion', default=0.1, type=float)
56 |
57 | parser.add_argument('--weight_decay', default=0.1, type=float)
58 |
59 | parser.add_argument('--adam_epsilon', default=1e-8, type=float)
60 |
61 | parser.add_argument('--train_batch_size', default=32, type=int)
62 |
63 | parser.add_argument('--eval_model', default=True, action='store_true',
64 | help='whether to eval model after training')
65 |
66 | return parser
67 |
68 | def get_parser(self):
69 | parser = self.parse()
70 | parser = self.initialize(parser)
71 | return parser.parse_args()
72 |
--------------------------------------------------------------------------------
/bert_multi_classification/convert_test_format.py:
--------------------------------------------------------------------------------
1 | # Author:xyy
2 | # CreatTime:2022-07-10
3 | # FileName:convert_format
4 | # Description: convert the competition's test-file format into the format this code expects
5 |
6 | def process_test_data(file_name):
7 | with open('./data/raw_data/' + file_name, encoding='utf-8') as file:
8 | file_content = eval(file.read().strip())
9 |
10 | with open('./data/processed_data/' + file_name, 'a', encoding='utf-8') as file:
11 | content_text = {}
12 | for item in file_content:
13 | data_text = item['my_text']
14 | content_text["id"] = item["id"]
15 | content_text["text"] = data_text.strip()
16 | file.write(f"{content_text}\n")
17 |
18 |
19 | process_test_data('test.json')
20 |
--------------------------------------------------------------------------------
/bert_multi_classification/convert_train_dev_format.py:
--------------------------------------------------------------------------------
1 | # Author:xyy
2 | # CreatTime:
3 | # FileName:convert_format
4 | # Description: convert the competition's train/dev-file format into the format this code expects
5 |
6 | def process_data(file_name):
7 | with open('./data/raw_data/' + file_name, encoding='utf-8') as file:
8 | file_content = eval(file.read().strip())
9 |
10 | with open('./data/processed_data/' + file_name, 'a', encoding='utf-8') as file:
11 | content_text = {}
12 | for item in file_content:
13 | data_text = item['my_text']
14 | content_text['text'] = data_text.strip()
15 | content_text['event_list'] = []
16 | data_event = item['events']
17 | if len(data_event) > 0:
18 | for i in data_event:
19 | content_text['event_list'].append({'event_type': i[1]})
20 | if len(content_text['event_list']) >= 0:
21 | file.write(f"{content_text}\n")
22 | elif len(data_event) == 0:
23 | content_text['event_list'].append({'event_type': '无事件'})
24 | file.write(f"{content_text}\n")
25 |
26 |
27 | process_data('train.json')
28 | # process_data('dev.json')
29 |
--------------------------------------------------------------------------------
/bert_multi_classification/data_preprocess.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append('../..')
4 | import os
5 | import logging
6 | from transformers import BertTokenizer
7 | import config
8 | import numpy as np
9 | from utils import utils
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | class InputExample:
15 | def __init__(self, set_type, text, labels=None):
16 | self.set_type = set_type
17 | self.text = text
18 | self.labels = labels
19 |
20 |
21 | class BaseFeature:
22 | def __init__(self, token_ids, attention_masks, token_type_ids):
23 |         # BERT inputs
24 | self.token_ids = token_ids
25 | self.attention_masks = attention_masks
26 | self.token_type_ids = token_type_ids
27 |
28 |
29 | class BertFeature(BaseFeature):
30 | def __init__(self, token_ids, attention_masks, token_type_ids, labels=None):
31 | super(BertFeature, self).__init__(
32 | token_ids=token_ids,
33 | attention_masks=attention_masks,
34 | token_type_ids=token_type_ids)
35 | # labels
36 | self.labels = labels
37 |
38 |
39 | class Processor:
40 |
41 | @staticmethod
42 | def read_json(file_path):
43 | with open(file_path, encoding='utf-8') as f:
44 | raw_examples = f.read().strip()
45 | return raw_examples
46 |
47 | def get_examples(self, raw_examples, set_type):
48 | examples = []
49 |         # pull the fields out of the per-line json dicts
50 | for line in raw_examples.split('\n'):
51 | line = eval(line)
52 | labels = []
53 | if len(line['event_list']) != 0:
54 | for tmp in line['event_list']:
55 | labels.append(tmp['event_type'])
56 | examples.append(InputExample(set_type=set_type,
57 | text=line['text'],
58 | labels=labels))
59 | return examples
60 |
61 |
62 | def convert_bert_example(ex_idx, example: InputExample, tokenizer: BertTokenizer, max_seq_len, label2id):
63 | set_type = example.set_type
64 | raw_text = example.text
65 | labels = example.labels
66 |     # text tuple (kept as callback info)
67 | callback_info = (raw_text,)
68 | callback_labels = labels
69 | callback_info += (callback_labels,)
70 |     # convert the labels to a multi-hot encoding
71 | label_ids = [0 for _ in range(len(label2id))]
72 | for label in labels:
73 | label_ids[label2id[label]] = 1
74 |
75 | encode_dict = tokenizer.encode_plus(text=raw_text,
76 | add_special_tokens=True,
77 | max_length=max_seq_len,
78 |                                         truncation='longest_first',
79 | padding="max_length",
80 | return_token_type_ids=True,
81 | return_attention_mask=True)
82 | token_ids = encode_dict['input_ids']
83 | attention_masks = encode_dict['attention_mask']
84 | token_type_ids = encode_dict['token_type_ids']
85 |
86 | while len(token_ids) < max_seq_len:
87 | token_ids.append(0)
88 | attention_masks.append(0)
89 | token_type_ids.append(0)
90 |
91 | assert len(token_ids) == max_seq_len
92 | assert len(attention_masks) == max_seq_len
93 | assert len(token_type_ids) == max_seq_len
94 |
95 | if ex_idx < 3:
96 | decode_text = tokenizer.decode(np.array(token_ids)[np.where(np.array(attention_masks) == 1)[0]].tolist())
97 | logger.info(f"*** {set_type}_example-{ex_idx} ***")
98 | logger.info(f"text: {decode_text}")
99 | logger.info(f"token_ids: {token_ids}")
100 | logger.info(f"attention_masks: {attention_masks}")
101 | logger.info(f"token_type_ids: {token_type_ids}")
102 | logger.info(f"labels: {label_ids}")
103 |
104 | feature = BertFeature(
105 | # bert inputs
106 | token_ids=token_ids,
107 | attention_masks=attention_masks,
108 | token_type_ids=token_type_ids,
109 | labels=label_ids,
110 | )
111 |
112 | return feature, callback_info
113 |
114 |
115 | def convert_examples_to_features(examples, max_seq_len, bert_dir, label2id):
116 | tokenizer = BertTokenizer(os.path.join(bert_dir, 'vocab.txt'))
117 | features = []
118 | callback_info = []
119 |
120 | logger.info(f'Convert {len(examples)} examples to features')
121 |
122 | for i, example in enumerate(examples):
123 | feature, tmp_callback = convert_bert_example(
124 | ex_idx=i,
125 | example=example,
126 | max_seq_len=max_seq_len,
127 | tokenizer=tokenizer,
128 | label2id=label2id,
129 | )
130 | if feature is None:
131 | continue
132 |
133 | features.append(feature)
134 | callback_info.append(tmp_callback)
135 | logger.info(f'Build {len(features)} features')
136 |
137 | out = (features,)
138 |
139 | if not len(callback_info):
140 | return out
141 |
142 | out += (callback_info,)
143 | return out
144 |
145 |
146 | def get_out(processor, json_path, args, label2id, mode):
147 | raw_examples = processor.read_json(json_path)
148 |
149 | examples = processor.get_examples(raw_examples, mode)
150 | for i, example in enumerate(examples):
151 | print(example.text)
152 | print(example.labels)
153 | if i == 5:
154 | break
155 | out = convert_examples_to_features(examples, args.max_seq_len, args.bert_dir, label2id)
156 | return out
157 |
158 |
159 | if __name__ == '__main__':
160 | args = config.Args().get_parser()
161 | args.log_dir = 'logs/'
162 | args.max_seq_len = 128
163 | args.bert_dir = 'model/bert-base-chinese/'
164 | utils.set_logger(os.path.join(args.log_dir, 'preprocess.log'))
165 | logger.info(vars(args))
166 |
167 | processor = Processor()
168 |
169 | label2id = {}
170 | id2label = {}
171 | with open('../data/MutiClass/processed_data/labels.txt', 'r', encoding='utf-8') as fp:
172 | labels = fp.read().strip().split('\n')
173 | for i, label in enumerate(labels):
174 | label2id[label] = i
175 |             id2label[i] = label
176 | print(label2id)
177 |
178 | train_out = get_out(processor, '../data/MutiClass/processed_data/train.json', args, label2id, 'train')
179 | dev_out = get_out(processor, '../data/MutiClass/processed_data/dev.json', args, label2id, 'dev')
180 |
--------------------------------------------------------------------------------
/bert_multi_classification/dataset.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append('../..')
4 | import torch
5 | from torch.utils.data import Dataset
6 | # BertFeature must be imported explicitly here, otherwise loading the features raises an error
7 | from data_preprocess import get_out, Processor, BertFeature
8 | import config
9 |
10 |
11 | class MLDataset(Dataset):
12 | def __init__(self, features):
13 | self.nums = len(features)
14 |
15 | self.token_ids = [torch.tensor(example.token_ids).long() for example in features]
16 | self.attention_masks = [torch.tensor(example.attention_masks).float() for example in features]
17 | self.token_type_ids = [torch.tensor(example.token_type_ids).long() for example in features]
18 | self.labels = [torch.tensor(example.labels).float() for example in features]
19 |
20 | def __len__(self):
21 | return self.nums
22 |
23 | def __getitem__(self, index):
24 | data = {
25 | 'token_ids': self.token_ids[index],
26 | 'attention_masks': self.attention_masks[index],
27 | 'token_type_ids': self.token_type_ids[index]
28 | }
29 |
30 | data['labels'] = self.labels[index]
31 |
32 | return data
33 |
34 |
35 | if __name__ == '__main__':
36 | args = config.Args().get_parser()
37 | args.log_dir = 'logs/'
38 | args.max_seq_len = 128
39 | args.bert_dir = 'model/bert-base-chinese/'
40 |
41 | processor = Processor()
42 |
43 | label2id = {}
44 | id2label = {}
45 | with open('../data/MutiClass/processed_data/labels.txt', 'r', encoding='utf-8') as fp:
46 | labels = fp.read().strip().split('\n')
47 | for i, label in enumerate(labels):
48 | label2id[label] = i
49 |             id2label[i] = label
50 | print(label2id)
51 |
52 | train_out = get_out(processor, '../data/MutiClass/processed_data/train.json', args, label2id, 'train')
53 | features, callback_info = train_out
54 | train_dataset = MLDataset(features)
55 | for data in train_dataset:
56 | print(data['token_ids'])
57 | print(data['attention_masks'])
58 | print(data['token_type_ids'])
59 | print(data['labels'])
60 | break
61 |
--------------------------------------------------------------------------------
/bert_multi_classification/logs/final_main.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tangjielong928/Multi-modal-Event-Extraction/4e133e609f3ada97862dd65123652f569761de5d/bert_multi_classification/logs/final_main.log
--------------------------------------------------------------------------------
/bert_multi_classification/logs/main.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tangjielong928/Multi-modal-Event-Extraction/4e133e609f3ada97862dd65123652f569761de5d/bert_multi_classification/logs/main.log
--------------------------------------------------------------------------------
/bert_multi_classification/logs/preprocess.log:
--------------------------------------------------------------------------------
1 | 2022-08-10 17:12:05,316 - INFO - 2_data_preprocess.py - - 165 - {'output_dir': './checkpoints/', 'bert_dir': './model/bert-base-chinese/', 'data_dir': './data/tcner/', 'log_dir': './logs/', 'num_tags': 8, 'seed': 123, 'gpu_ids': '0', 'max_seq_len': 128, 'eval_batch_size': 8, 'swa_start': 3, 'train_epochs': 15, 'dropout_prob': 0, 'lr': 2e-05, 'other_lr': 0.0003, 'max_grad_norm': 1, 'warmup_proportion': 0.1, 'weight_decay': 0, 'adam_epsilon': 1e-08, 'train_batch_size': 8, 'eval_model': True}
2 | 2022-08-10 17:12:05,355 - INFO - 2_data_preprocess.py - convert_examples_to_features - 120 - Convert 642 examples to features
3 |
--------------------------------------------------------------------------------
/bert_multi_classification/model/bert-base-chinese/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertForMaskedLM"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "directionality": "bidi",
7 | "hidden_act": "gelu",
8 | "hidden_dropout_prob": 0.1,
9 | "hidden_size": 768,
10 | "initializer_range": 0.02,
11 | "intermediate_size": 3072,
12 | "layer_norm_eps": 1e-12,
13 | "max_position_embeddings": 512,
14 | "model_type": "bert",
15 | "num_attention_heads": 12,
16 | "num_hidden_layers": 12,
17 | "pad_token_id": 0,
18 | "pooler_fc_size": 768,
19 | "pooler_num_attention_heads": 12,
20 | "pooler_num_fc_layers": 3,
21 | "pooler_size_per_head": 128,
22 | "pooler_type": "first_token_transform",
23 | "type_vocab_size": 2,
24 | "vocab_size": 21128
25 | }
26 |
--------------------------------------------------------------------------------
/bert_multi_classification/model/bert-base-chinese/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertForMaskedLM"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "directionality": "bidi",
7 | "hidden_act": "gelu",
8 | "hidden_dropout_prob": 0.1,
9 | "hidden_size": 768,
10 | "initializer_range": 0.02,
11 | "intermediate_size": 3072,
12 | "layer_norm_eps": 1e-12,
13 | "max_position_embeddings": 512,
14 | "model_type": "bert",
15 | "num_attention_heads": 12,
16 | "num_hidden_layers": 12,
17 | "pad_token_id": 0,
18 | "pooler_fc_size": 768,
19 | "pooler_num_attention_heads": 12,
20 | "pooler_num_fc_layers": 3,
21 | "pooler_size_per_head": 128,
22 | "pooler_type": "first_token_transform",
23 | "type_vocab_size": 2,
24 | "vocab_size": 21128
25 | }
26 |
--------------------------------------------------------------------------------
/bert_multi_classification/models.py:
--------------------------------------------------------------------------------
1 | from transformers import BertModel
2 | import torch.nn as nn
3 |
4 |
5 | class BertForMultiLabelClassification(nn.Module):
6 | def __init__(self, args):
7 | super(BertForMultiLabelClassification, self).__init__()
8 | self.bert = BertModel.from_pretrained('bert-base-chinese')
9 | self.bert_config = self.bert.config
10 | out_dims = self.bert_config.hidden_size
11 | self.dropout = nn.Dropout(0.1)
12 | self.linear = nn.Linear(out_dims, args.num_tags)
13 |
14 | def forward(self, token_ids, attention_masks, token_type_ids):
15 | bert_outputs = self.bert(
16 | input_ids = token_ids,
17 | attention_mask = attention_masks,
18 | token_type_ids = token_type_ids,
19 | )
20 |         seq_out = bert_outputs[1]  # pooled [CLS] output
21 | seq_out = self.dropout(seq_out)
22 | seq_out = self.linear(seq_out)
23 | return seq_out
--------------------------------------------------------------------------------
/bert_multi_classification/predict_result.py:
--------------------------------------------------------------------------------
1 | # Author:xyy
2 | # CreatTime:2022-07-11
3 | # FileName:predict_result
4 | # Description:
5 |
6 |
7 | import os
8 | import logging
9 | import torch
10 | import json
11 | import torch.nn as nn
12 | import numpy as np
13 | from transformers import BertTokenizer
14 | import config
15 | import data_preprocess
16 | import models
17 | import utils.utils as ut
18 | from tqdm import tqdm
19 |
20 | logger = logging.getLogger(__name__)
21 |
22 |
23 | class Trainer:
24 | def __init__(self, args, train_loader, dev_loader, test_loader):
25 | self.args = args
26 | gpu_ids = args.gpu_ids.split(',')
27 | self.device = torch.device("cpu" if gpu_ids[0] == '-1' else "cuda:" + gpu_ids[0])
28 | self.model = models.BertForMultiLabelClassification(args)
29 | self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=self.args.lr)
30 | self.criterion = nn.BCEWithLogitsLoss()
31 | self.train_loader = train_loader
32 | self.dev_loader = dev_loader
33 | self.test_loader = test_loader
34 | self.model.to(self.device)
35 |
36 | def load_ckp(self, model, checkpoint_path):
37 | checkpoint = torch.load(checkpoint_path)
38 | model.load_state_dict(checkpoint['state_dict'])
39 | # optimizer.load_state_dict(checkpoint['optimizer'])
40 | epoch = checkpoint['epoch']
41 | loss = checkpoint['loss']
42 | return model, epoch, loss
43 |
44 | def test(self, checkpoint_path):
45 | model = self.model
46 | optimizer = torch.optim.Adam(params=self.model.parameters(), lr=self.args.lr)
47 |         model, epoch, loss = self.load_ckp(model, checkpoint_path)
48 | model.eval()
49 | model.to(self.device)
50 | total_loss = 0.0
51 | test_outputs = []
52 | test_targets = []
53 | with torch.no_grad():
54 | for test_step, test_data in enumerate(self.test_loader):
55 | token_ids = test_data['token_ids'].to(self.device)
56 | attention_masks = test_data['attention_masks'].to(self.device)
57 | token_type_ids = test_data['token_type_ids'].to(self.device)
58 | labels = test_data['labels'].to(self.device)
59 | outputs = model(token_ids, attention_masks, token_type_ids)
60 | loss = self.criterion(outputs, labels)
61 | # val_loss = val_loss + ((1 / (dev_step + 1))) * (loss.item() - val_loss)
62 | total_loss += loss.item()
63 | outputs = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
64 | outputs = (np.array(outputs) > 0.6).astype(int)
65 | test_outputs.extend(outputs.tolist())
66 | test_targets.extend(labels.cpu().detach().numpy().tolist())
67 |
68 | return total_loss, test_outputs, test_targets
69 |
70 | def predict(self, tokenizer, text, id2label, args):
71 | model = self.model
72 | optimizer = self.optimizer
73 | checkpoint = os.path.join(args.output_dir, 'best.pt')
74 | model, epoch, loss = self.load_ckp(model, checkpoint)
75 | model.eval()
76 | model.to(self.device)
77 | with torch.no_grad():
78 | inputs = tokenizer.encode_plus(text=text,
79 | add_special_tokens=True,
80 | max_length=args.max_seq_len,
81 | truncation='longest_first',
82 | padding="max_length",
83 | return_token_type_ids=True,
84 | return_attention_mask=True,
85 | return_tensors='pt')
86 | token_ids = inputs['input_ids'].to(self.device)
87 | attention_masks = inputs['attention_mask'].to(self.device)
88 | token_type_ids = inputs['token_type_ids'].to(self.device)
89 |
90 | outputs = model(token_ids, attention_masks, token_type_ids)
91 | outputs = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
92 | outputs = (np.array(outputs) > 0.5).astype(int)
93 | outputs = np.where(outputs[0] == 1)[0].tolist()
94 | if len(outputs) != 0:
95 | outputs = [id2label[i] for i in outputs]
96 | return outputs
97 | else:
98 | return ["无事件"]
99 |
100 |
101 | if __name__ == '__main__':
102 | args = config.Args().get_parser()
103 | ut.set_seed(args.seed)
104 | ut.set_logger(os.path.join(args.log_dir, 'final_main.log'))
105 |
106 | processor = data_preprocess.Processor()
107 |
108 | label2id = {}
109 | id2label = {}
110 | with open('../data/MutiClass/processed_data/labels.txt', 'r', encoding='utf-8') as fp:
111 | labels = fp.read().strip().split('\n')
112 | for i, label in enumerate(labels):
113 | label2id[label] = i
114 | id2label[i] = label
115 |
116 |     # prediction
117 | trainer = Trainer(args, None, None, None)
118 | tokenizer = BertTokenizer.from_pretrained(args.bert_dir)
119 | res_list = []
120 | # 读取test1.json里面的数据
121 | with open(os.path.join('../data/raw_data/test/test_no_ann.json'), 'r', encoding='utf-8')as fp:
122 | lines = json.load(fp)
123 | # lines = fp.read().strip().split('\n')
124 |
125 | print("****************Now start our multi-labels inference!!*************")
126 | for line in tqdm(lines):
127 | id = line["id"]
128 | text = line["my_text"]
129 | result = trainer.predict(tokenizer, text, id2label, args)
130 | # print(result)
131 | res_list.append({"id": id, "text": text, "class": result})
132 |
133 | with open('../data/result/multi_label/result.json', 'w', encoding='utf-8') as f:
134 | # f.write("[\n")
135 | # for item in res_list:
136 | # f.write(str(item) + ",\n")
137 | # f.write("]")
138 | f.write(json.dumps(res_list, ensure_ascii=False))
139 |
--------------------------------------------------------------------------------
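For reference, the `predict` method above turns the model's sigmoid scores into label names by thresholding at 0.5 and mapping the surviving indices through `id2label`, falling back to "无事件" when nothing passes. A minimal sketch of that decoding step, with hypothetical scores and a toy label map:

```python
import numpy as np

# hypothetical sigmoid scores for one sample, plus a toy id2label map
scores = np.array([[0.81, 0.12, 0.57]])
id2label = {0: "eventA", 1: "eventB", 2: "eventC"}

active = np.where(scores[0] > 0.5)[0].tolist()
labels = [id2label[i] for i in active] if active else ["无事件"]
print(labels)  # ['eventA', 'eventC']
```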
/bert_multi_classification/utils/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import functools
3 | import random
4 | import logging
5 | import time
6 | import numpy as np
7 | import torch
8 |
9 | def timer(func):
10 | """
11 | Function timer.
12 | :param func: function to be timed
13 | :return: wrapped function
14 | """
15 |
16 | @functools.wraps(func)
17 | def wrapper(*args, **kwargs):
18 | start = time.time()
19 | res = func(*args, **kwargs)
20 | end = time.time()
21 | print("{} took about {:.4f} seconds".format(func.__name__, end - start))
22 | return res
23 |
24 | return wrapper
25 |
26 |
27 | def set_seed(seed=123):
28 | """
29 | Set the random seeds so that experiments are reproducible.
30 | :param seed:
31 | :return:
32 | """
33 | random.seed(seed)
34 | torch.manual_seed(seed)
35 | np.random.seed(seed)
36 | torch.cuda.manual_seed_all(seed)
37 |
38 |
39 | def set_logger(log_path):
40 | """
41 | Configure logging.
42 | :param log_path: path of the log file
43 | :return:
44 | """
45 | logger = logging.getLogger()
46 | logger.setLevel(logging.INFO)
47 |
48 | # Every call to set_logger would otherwise add another handler and duplicate log lines, so check whether the root logger already has each handler
49 | if not any(handler.__class__ == logging.FileHandler for handler in logger.handlers):
50 | file_handler = logging.FileHandler(log_path)
51 | formatter = logging.Formatter(
52 | '%(asctime)s - %(levelname)s - %(filename)s - %(funcName)s - %(lineno)d - %(message)s')
53 | file_handler.setFormatter(formatter)
54 | logger.addHandler(file_handler)
55 |
56 | if not any(handler.__class__ == logging.StreamHandler for handler in logger.handlers):
57 | stream_handler = logging.StreamHandler()
58 | stream_handler.setFormatter(logging.Formatter('%(message)s'))
59 | logger.addHandler(stream_handler)
60 |
61 |
--------------------------------------------------------------------------------
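The `timer` decorator above wraps a function and prints its wall-clock running time. A minimal usage sketch, assuming this module is importable as `utils.utils` (the decorated function is made up for illustration):

```python
import time
from utils.utils import timer  # assumes bert_multi_classification/ is on the import path

@timer
def slow_add(a, b):
    time.sleep(0.1)  # stand-in for real work
    return a + b

slow_add(1, 2)  # prints roughly: slow_add took about 0.1001 seconds
```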
/evaluation_metric.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''
3 | Compute the evaluation metrics P, R and F1:
4 | F1 = (2 * P * R) / (P + R)
5 | P = correctly predicted event arguments / all predicted event arguments
6 | R = correctly predicted event arguments / all gold-standard event arguments
7 | '''
8 |
9 | class EvaluationMetric(object):
10 | def __init__(self, txt_pre='', txt_gt=''):
11 | self.pre_, self.count_pre_ = self.read_pre(txt_pre=txt_pre)
12 | self.gt_, self.count_gt_ = self.read_gt(txt_gt=txt_gt)
13 |
14 | def read_pre(self, txt_pre=''):
15 | with open(txt_pre, 'r') as f:
16 | lines = f.readlines()
17 | if lines:
18 | pre_dict = {}
19 | for line in lines:
20 | argus = line.strip().split('\t')
21 | # assert len(argus) == 6
22 | id = argus[0]
23 | if id in pre_dict.keys():
24 | pre_dict[id].append(argus[1:6])
25 | else:
26 | pre_dict[id] = [argus[1:6], ]
27 | return pre_dict, len(lines)
28 |
29 | def read_gt(self, txt_gt=''):
30 | with open(txt_gt, 'r') as f:
31 | lines = f.readlines()
32 | if lines:
33 | gt_dict = {}
34 | for line in lines:
35 | argus = line.strip().split('\t')
36 | # assert len(argus) == 6
37 | id = argus[0]
38 | if id in gt_dict.keys():
39 | gt_dict[id].append(argus[1:6])
40 | else:
41 | gt_dict[id] = [argus[1:6], ]
42 | return gt_dict, len(lines)
43 |
44 | def calculate_iou_1(self, bbox_pre='', bbox_gt=''):
45 | bbox_pre_pre = [float(it) for it in bbox_pre[1:-1].split(' ')]
46 | bbox_pre_gt = [float(it) for it in bbox_gt[1:-1].split(' ')]
47 |
48 | s_rec1 = (bbox_pre_pre[2] - bbox_pre_pre[0]) * (bbox_pre_pre[3] - bbox_pre_pre[1]) # area of the first bbox = width x height
49 | s_rec2 = (bbox_pre_gt[2] - bbox_pre_gt[0]) * (bbox_pre_gt[3] - bbox_pre_gt[1]) # area of the second bbox = width x height
50 | sum_s = s_rec1 + s_rec2 # sum of the two areas
51 | left = max(bbox_pre_pre[0], bbox_pre_gt[0]) # left x of the intersection
52 | right = min(bbox_pre_pre[2], bbox_pre_gt[2]) # right x of the intersection
53 | bottom = max(bbox_pre_pre[1], bbox_pre_gt[1]) # lower y of the intersection
54 | top = min(bbox_pre_pre[3], bbox_pre_gt[3]) # upper y of the intersection
55 | if left >= right or top <= bottom: # the boxes do not intersect
56 | return 0
57 | else:
58 | inter = (right - left) * (top - bottom) # intersection area
59 | iou = (inter / (sum_s - inter)) * 1.0 # IoU = intersection / union
60 | return iou
61 |
62 | def calculate_iou_2(self, bbox_pre='', bbox_gt=''):
63 | bbox_pre_pre = [float(it) for it in bbox_pre[1:-1].split(' ')]
64 | bbox_pre_gt = [float(it) for it in bbox_gt[1:-1].split(' ')]
65 | x1, y1, x2, y2 = bbox_pre_pre # top-left and bottom-right corners of box1
66 | x3, y3, x4, y4 = bbox_pre_gt # top-left and bottom-right corners of box2
67 |
68 | # coordinates of the intersection
69 | x_inter1 = max(x1, x3) # left x of the intersection
70 | y_inter1 = max(y1, y3) # upper-left y of the intersection
71 | x_inter2 = min(x2, x4) # right x of the intersection
72 | y_inter2 = min(y2, y4) # lower-right y of the intersection
73 |
74 | # intersection area; since coordinates are pixel indices, lengths need a +1:
75 | # e.g. pixels (0,0) and (1,0) span a width of 1 - 0 + 1 = 2, not 1 - 0 = 1
76 | interArea = max(0, x_inter2 - x_inter1 + 1) * max(0, y_inter2 - y_inter1 + 1)
77 |
78 | # areas of the two boxes
79 | area_box1 = (x2 - x1 + 1) * (y2 - y1 + 1)
80 | area_box2 = (x4 - x3 + 1) * (y4 - y3 + 1)
81 |
82 | # IoU = intersection / union, where union = area_box1 + area_box2 - intersection
83 | iou = interArea / (area_box1 + area_box2 - interArea)
84 | return iou
85 |
86 | def calculate_P_R_F1(self, pre={}, count_pre=0, gt={}, count_gt=0, flag_onlyText=True):
87 | count_correct = 0
88 | for key_pre, val_pre in pre.items():
89 | if key_pre not in gt:
90 | continue
91 | val_gt = gt[key_pre]
92 | # compare every predicted argument with every gold argument of the same id
93 | for i_type, i_sta, i_end, i_argu, i_bbox in val_pre:
94 | for j_type, j_sta, j_end, j_argu, j_bbox in val_gt:
95 | if i_type != j_type or i_sta != j_sta or i_end != j_end:
96 | continue
97 | if flag_onlyText:
98 | count_correct += 1
99 | elif i_argu == j_argu:
100 | # neither side has a bbox: count as correct
101 | if i_bbox.startswith('-') and j_bbox.startswith('-'):
102 | count_correct += 1
103 | # both sides have a bbox: correct only if IoU > 0.5
104 | elif i_bbox.startswith('[') and j_bbox.startswith('['):
105 | iou = self.calculate_iou_2(i_bbox.strip(), j_bbox.strip())
106 | if iou > 0.5:
107 | count_correct += 1
108 | '''
109 | F1 = (2 * P * R) / (P + R)
110 | P = correctly predicted event arguments / all predicted event arguments
111 | R = correctly predicted event arguments / all gold-standard event arguments
112 | '''
113 | print("predicted: {}\ngold: {}\n\ncorrect: {}\n".format(count_pre, count_gt, count_correct))
114 |
115 | assert count_pre != 0
116 | assert count_gt != 0
117 | P = count_correct / count_pre
118 | R = count_correct / count_gt
119 | F1 = (2 * P * R) / (P + R)
120 |
121 | return P, R, F1
122 |
123 | if __name__ == '__main__':
124 | em = EvaluationMetric(txt_pre='/data/zxwang/ccks2022/val/val_result/result_20220725_180219_clear.txt', txt_gt='../THUCNews/ccks2022/val_gt.txt')
125 | P, R, F1 = em.calculate_P_R_F1(em.pre_, em.count_pre_, em.gt_, em.count_gt_, flag_onlyText=False)
126 | print("P={:.5f}\tR={:.5f}\tF1={:.5f}".format(P, R, F1))
--------------------------------------------------------------------------------
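As a quick sanity check of the metric definitions above, a minimal sketch with hypothetical counts:

```python
# hypothetical counts: 8 predicted arguments, 10 gold arguments, 6 correct
count_correct, count_pre, count_gt = 6, 8, 10

P = count_correct / count_pre   # 0.75
R = count_correct / count_gt    # 0.60
F1 = (2 * P * R) / (P + R)      # about 0.667
print("P={:.5f}\tR={:.5f}\tF1={:.5f}".format(P, R, F1))
```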
/requirements.txt:
--------------------------------------------------------------------------------
1 | albumentations==1.2.1
2 | coremltools==5.2.0
3 | Flask==2.1.3
4 | ipython==8.4.0
5 | matplotlib==3.5.2
6 | numpy==1.22.3
7 | onnx==1.12.0
8 | onnxruntime==1.12.1
9 | onnxsim==0.4.7
10 | opencv_python==4.6.0.66
11 | openvino==2022.1.0
12 | paddle==1.0.2
13 | paddlehub==2.2.0
14 | paddlenlp==2.3.4
15 | pafy==0.5.5
16 | pandas==1.4.3
17 | Pillow==9.2.0
18 | protobuf==4.21.5
19 | psutil==5.9.1
20 | pycocotools==2.0.4
21 | PyYAML==6.0
22 | requests==2.28.1
23 | scikit_learn==1.1.2
24 | scipy==1.8.1
25 | seaborn==0.11.2
26 | setuptools==62.6.0
27 | tensorflow==2.9.1
28 | tensorflowjs==3.19.0
29 | tensorrt==0.0.1.dev5
30 | tflite_runtime==2.9.1
31 | thop==0.1.1.post2207130030
32 | toolHandlerJLT==0.1.1
33 | torch==1.10.2+cu113
34 | torchvision==0.11.3+cu113
35 | tqdm==4.64.0
36 | transformers==4.15.0
37 |
--------------------------------------------------------------------------------
/role_classification/data/train.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tangjielong928/Multi-modal-Event-Extraction/4e133e609f3ada97862dd65123652f569761de5d/role_classification/data/train.xlsx
--------------------------------------------------------------------------------
/role_classification/dataset.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2022/7/28 13:40
3 | # @Author: Jielong Tang
4 | # @File : dataset.py
5 | import paddle
6 | import paddlenlp
7 | import pandas as pd
8 |
9 | label_vocab = {'airplane': 0, 'boat': 1, 'missile': 2, 'other': 3, 'submarine': 4, 'truck': 5}
10 |
11 |
12 | class SelfDefinedDataset(paddle.io.Dataset):
13 | def __init__(self, data_path, sheet_name):
14 | super(SelfDefinedDataset, self).__init__()
15 | self.data = []
16 | label_vocab = {'airplane': 0, 'boat': 1, 'missile': 2, 'other': 3, 'submarine': 4, 'truck': 5}
17 | train_df = pd.read_excel(data_path, header=0, sheet_name=sheet_name)
18 | for txt, label in zip(train_df['Column6'].tolist(), train_df['Column7'].tolist()):
19 | if len(txt) > 0 and len(label) > 0:
20 | self.data.append(dict(text=txt, label=label_vocab[label]))
21 |
22 |
23 | def __getitem__(self, index):
24 | return self.data[index]
25 |
26 | def __len__(self):
27 | return len(self.data)
28 |
29 | def get_labels(self):
30 | return ['0', '1', '2', '3', '4', '5']
31 |
32 |
33 |
34 | def xlsx_to_list(data_path):
35 | res_list = []
36 | dev_list = []
37 | train_df = pd.read_excel(data_path, header=0, sheet_name="train_content")
38 | dev_df = pd.read_excel(data_path, header=0, sheet_name="dev_content")
39 | for txt, label in zip(train_df['Column6'].tolist(), train_df['Column7'].tolist()):
40 | res_list.append(dict(text=txt, label=label))
41 | for txt, label in zip(dev_df['Column6'].tolist(), dev_df['Column7'].tolist()):
42 | dev_list.append(dict(text=txt, label=label))
43 | return res_list, dev_list
44 |
45 |
46 | #
47 | # train_ds, dev_ds, test_ds = SelfDefinedDataset.get_datasets([trainlst, devlst, testlst])
48 | if __name__ == "__main__":
49 | trainlst = SelfDefinedDataset('data/train.xlsx', "train_content")
50 | print(trainlst.data)
51 | # train_ds, dev_ds = SelfDefinedDataset.get_datasets([trainlst, devlst])
--------------------------------------------------------------------------------
/role_classification/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2022/7/28 17:01
3 | # @Author: Jielong Tang
4 | # @File : predict.py
5 | import os
6 |
7 | import paddle
8 | from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTinyTokenizer
9 | from .dataset import label_vocab
10 | from .utils import predict
11 |
12 | def load_model(init_ckpt = None):
13 | paddle.set_device('gpu')
14 | tokenizer = ErnieTinyTokenizer.from_pretrained('ernie-tiny')
15 | model = ErnieForSequenceClassification.from_pretrained('ernie-tiny', num_classes=len(label_vocab))
16 | if not init_ckpt or not os.path.isfile(init_ckpt):
17 | raise Exception("init checkpoint {} does not exist".format(init_ckpt))
18 | else:
19 | print('load model {}'.format(init_ckpt))
20 | state_dict = paddle.load(init_ckpt)
21 | model.set_dict(state_dict)
22 | return model, tokenizer
23 |
24 | def do_predict(test_sample, model, tokenizer):
25 | data = dict(text=test_sample, label='NULL')
26 | id2label = {val: key for key, val in label_vocab.items()}
27 | result = predict(model, data, tokenizer, label_vocab, batch_size=1)
28 | return id2label[result[0]]
29 |
30 | if __name__ == '__main__':
31 | '''
32 | Standard usage of the classifier: first load the model with load_model, then run inference with do_predict. Keep the model-loading code outside of any loop.
33 | '''
34 | model,tokenizer = load_model(init_ckpt='../ckpt/role_judge_model/model_state.pdparams')
35 | print(do_predict(model=model, tokenizer=tokenizer, test_sample='里根号航母舰队'))
36 |
37 |
38 | # with open('./data/result_20220727_192612.txt','r',encoding='utf-8') as f:
39 | # result_list = []
40 | # for line in f:
41 | # if line.strip().split('\t')[4] in ['发起者','承受者','使用器械']:
42 | # test_sample = line.strip().split('\t')[6]
43 | # result = do_predict(model = model,tokenizer=tokenizer, test_sample=test_sample)
44 | # result_list.append(line.strip().split('\t')[4] + '\t'+test_sample + '\t'+ result)
45 | # with open('./data/result_show.txt','w',encoding='utf-8') as f:
46 | # for line in result_list:
47 | # f.write(line+'\n')
48 |
49 |
50 |
--------------------------------------------------------------------------------
/role_classification/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2022/7/28 11:33
3 | # @Author: Jielong Tang
4 | # @File : train.py
5 |
6 | import paddlehub as hub
7 | import numpy as np
8 | import paddle
9 | import paddle.nn.functional as F
10 | from paddlenlp.data import Stack, Tuple, Pad
11 | from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTinyTokenizer, LinearDecayWithWarmup
12 | from .dataset import SelfDefinedDataset, label_vocab
13 | from functools import partial
14 | from .utils import convert_example, create_dataloader, evaluate
15 |
16 | batch_size = 128
17 | max_seq_length = 42
18 | learning_rate = 5e-5
19 | epochs = 30
20 | # proportion of steps used for learning-rate warm-up
21 | warmup_proportion = 0.1
22 | # weight decay coefficient
23 | weight_decay = 0.01
24 |
25 | def train():
26 | paddle.set_device('gpu')
27 |
28 | train_ds = SelfDefinedDataset(data_path='data/train.xlsx', sheet_name="train_content")
29 | dev_ds = SelfDefinedDataset(data_path='data/train.xlsx', sheet_name="dev_content")
30 |
31 | model = ErnieForSequenceClassification.from_pretrained('ernie-tiny', num_classes=len(label_vocab))
32 | tokenizer = ErnieTinyTokenizer.from_pretrained('ernie-tiny')
33 |
34 | trans_func = partial(
35 | convert_example,
36 | tokenizer=tokenizer,
37 | max_seq_length=max_seq_length)
38 | batchify_fn = lambda samples, fn=Tuple(
39 | Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
40 | Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment
41 | Stack(dtype="int64") # label
42 | ): fn(list(map(trans_func, samples)))
43 |
44 | train_data_loader = create_dataloader(
45 | train_ds,
46 | mode='train',
47 | batch_size=batch_size,
48 | batchify_fn=batchify_fn,
49 | trans_function=trans_func)
50 | dev_data_loader = create_dataloader(
51 | dev_ds,
52 | mode='dev',
53 | batch_size=batch_size,
54 | batchify_fn=batchify_fn,
55 | trans_function=trans_func)
56 |
57 | num_training_steps = len(train_data_loader) * epochs
58 | lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, warmup_proportion)
59 | optimizer = paddle.optimizer.AdamW(
60 | learning_rate=lr_scheduler,
61 | parameters=model.parameters(),
62 | weight_decay=weight_decay,
63 | apply_decay_param_fun=lambda x: x in [
64 | p.name for n, p in model.named_parameters()
65 | if not any(nd in n for nd in ["bias", "norm"])
66 | ])
67 |
68 | criterion = paddle.nn.loss.CrossEntropyLoss()
69 | metric = paddle.metric.Accuracy()
70 |
71 | global_step = 0
72 | for epoch in range(1, epochs + 1):
73 | for step, batch in enumerate(train_data_loader, start=1):
74 | input_ids, segment_ids, labels = batch
75 | logits = model(input_ids, segment_ids)
76 | loss = criterion(logits, labels)
77 | probs = F.softmax(logits, axis=1)
78 | correct = metric.compute(probs, labels)
79 | metric.update(correct)
80 | acc = metric.accumulate()
81 |
82 | global_step += 1
83 | if global_step % 10 == 0:
84 | print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f" % (
85 | global_step, epoch, step, loss, acc))
86 | if global_step % 50 == 0:
87 | print("*************dev result****************")
88 | evaluate(model, criterion, metric, dev_data_loader)
89 | print("*************dev result****************")
90 | loss.backward()
91 | optimizer.step()
92 | lr_scheduler.step()
93 | optimizer.clear_grad()
94 |
95 | print("*************dev result****************")
96 | evaluate(model, criterion, metric, dev_data_loader)
97 | print("*************dev result****************")
98 |
99 | model.save_pretrained('./role_judge_model')
100 | tokenizer.save_pretrained('./role_judge_model')
101 |
102 | if __name__ == '__main__':
103 | train()
--------------------------------------------------------------------------------
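The `batchify_fn` lambda in train.py composes `Tuple(Pad, Pad, Stack)` from `paddlenlp.data`: each `Pad` pads one field of every sample up to the longest sequence in the batch, and `Stack` stacks the labels into a single array. A minimal sketch with made-up, already-converted samples:

```python
from paddlenlp.data import Stack, Tuple, Pad

# hypothetical samples in the (input_ids, token_type_ids, label) layout
samples = [
    ([1, 2, 3, 4], [0, 0, 0, 0], [0]),
    ([1, 2], [0, 0], [1]),
]
batchify = Tuple(
    Pad(axis=0, pad_val=0),  # pad input_ids with zeros
    Pad(axis=0, pad_val=0),  # pad token_type_ids the same way
    Stack(dtype="int64"),    # stack labels into one int64 array
)
input_ids, token_type_ids, labels = batchify(samples)
print(input_ids)  # second row is zero-padded to length 4
```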
/role_classification/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2022/7/28 15:21
3 | # @Author: Jielong Tang
4 | # @File : utils.py
5 |
6 | import numpy as np
7 | import paddle
8 | import paddle.nn.functional as F
9 | from paddlenlp.data import Stack, Tuple, Pad
10 |
11 |
12 | def predict(model, data, tokenizer, label_map, batch_size=1):
13 |
14 | examples = []
15 | input_ids, segment_ids = convert_example(
16 | data,
17 | tokenizer,
18 | max_seq_length=42,
19 | is_test=True)
20 | examples.append((input_ids, segment_ids))
21 |
22 | batchify_fn = lambda samples, fn=Tuple(
23 | Pad(axis=0, pad_val=tokenizer.pad_token_id), # input id
24 | Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment id
25 | ): fn(samples)
26 |
27 | # Separate the data into batches.
28 | batches = []
29 | one_batch = []
30 | for example in examples:
31 | one_batch.append(example)
32 | if len(one_batch) == batch_size:
33 | batches.append(one_batch)
34 | one_batch = []
35 | if one_batch:
36 | # The last batch, whose size may be smaller than the configured batch_size.
37 | batches.append(one_batch)
38 |
39 | results = []
40 | model.eval()
41 | for batch in batches:
42 | input_ids, segment_ids = batchify_fn(batch)
43 | input_ids = paddle.to_tensor(input_ids)
44 | segment_ids = paddle.to_tensor(segment_ids)
45 | logits = model(input_ids, segment_ids)
46 | probs = F.softmax(logits, axis=1)
47 | idx = paddle.argmax(probs, axis=1).numpy()
48 | idx = idx.tolist()
49 | # labels = [label_map[i] for i in idx]
50 | results.extend(idx)
51 | return results
52 |
53 |
54 | @paddle.no_grad()
55 | def evaluate(model, criterion, metric, data_loader):
56 | model.eval()
57 | metric.reset()
58 | losses = []
59 | for batch in data_loader:
60 | input_ids, token_type_ids, labels = batch
61 | logits = model(input_ids, token_type_ids)
62 | loss = criterion(logits, labels)
63 | losses.append(loss.numpy())
64 | correct = metric.compute(logits, labels)
65 | metric.update(correct)
66 | accu = metric.accumulate()
67 | print("eval loss: %.5f, accu: %.5f" % (np.mean(losses), accu))
68 | model.train()
69 | metric.reset()
70 |
71 |
72 | def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
73 | """
74 | Builds model inputs from a sequence or a pair of sequence for sequence classification tasks
75 | by concatenating and adding special tokens. And creates a mask from the two sequences passed
76 | to be used in a sequence-pair classification task.
77 |
78 | A BERT sequence has the following format:
79 |
80 | - single sequence: ``[CLS] X [SEP]``
81 | - pair of sequences: ``[CLS] A [SEP] B [SEP]``
82 |
83 | A BERT sequence pair mask has the following format:
84 | ::
85 | 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
86 | | first sequence | second sequence |
87 |
88 | If only one sequence, only returns the first portion of the mask (0's).
89 |
90 |
91 | Args:
92 | example(obj:`list[str]`): List of input data, containing text and label if it has a label.
93 | tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
94 | which contains most of the methods. Users should refer to the superclass for more information regarding methods.
95 | max_seq_length(obj:`int`): The maximum total input sequence length after tokenization.
96 | Sequences longer than this will be truncated, sequences shorter will be padded.
97 | is_test(obj:`False`, defaults to `False`): Whether the example contains label or not.
98 |
99 | Returns:
100 | input_ids(obj:`list[int]`): The list of token ids.
101 | token_type_ids(obj: `list[int]`): List of sequence pair mask.
102 | label(obj:`numpy.array`, data type of int64, optional): The input label if not is_test.
103 | """
104 | encoded_inputs = tokenizer(text=example["text"], max_seq_len=max_seq_length)
105 | input_ids = encoded_inputs["input_ids"]
106 | token_type_ids = encoded_inputs["token_type_ids"]
107 |
108 | if not is_test:
109 | label = np.array([example["label"]], dtype="int64")
110 | return input_ids, token_type_ids, label
111 | else:
112 | return input_ids, token_type_ids
113 |
114 |
115 | # Reads data and generates mini-batches.
116 | def create_dataloader(dataset,
117 | trans_function=None,
118 | mode='train',
119 | batch_size=1,
120 | pad_token_id=0,
121 | batchify_fn=None):
122 | # if trans_function:
123 | # dataset = dataset.apply(trans_function, lazy=True)
124 |
125 | # return_list: whether the data is returned as python lists
126 | dataloader = paddle.io.DataLoader(
127 | dataset,
128 | return_list=True,
129 | batch_size=batch_size,
130 | collate_fn=batchify_fn)
131 |
132 | return dataloader
--------------------------------------------------------------------------------
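`convert_example` above returns plain id lists (plus a label array outside of test mode), which is what the `batchify_fn` in train.py expects. A minimal usage sketch with `convert_example` imported from this file, assuming the `ernie-tiny` weights can be downloaded or are cached locally:

```python
from paddlenlp.transformers import ErnieTinyTokenizer

tokenizer = ErnieTinyTokenizer.from_pretrained('ernie-tiny')
example = {"text": "里根号航母舰队", "label": 1}
input_ids, token_type_ids, label = convert_example(example, tokenizer, max_seq_length=42)
# input_ids: [CLS] + token ids + [SEP]; token_type_ids: all zeros for a single sequence
```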
/run_ee.sh:
--------------------------------------------------------------------------------
1 | dataset_name=EE1.0
2 | data_dir=./data/${dataset_name}
3 | #conf_dir=./conf/${dataset_name}
4 | conf_dir=./data/${dataset_name}
5 | ckpt_dir=./ckpt/${dataset_name}
6 | #submit_data_path=./submit/test_duee_1.json
7 |
8 | learning_rate=5e-5
9 | max_seq_len=300
10 | batch_size=32
11 | epoch=60
12 |
13 | echo -e "check and create directory"
14 | dir_list=(./ckpt ${ckpt_dir} ./submit)
15 | for item in ${dir_list[*]}
16 | do
17 | if [ ! -d ${item} ]; then
18 | mkdir ${item}
19 | echo "create dir * ${item} *"
20 | else
21 | echo "dir ${item} exist"
22 | fi
23 | done
24 |
25 | process_name=${1}
26 | pred_data=${data_dir}/multiLabel_result.json # multi-label prediction results
27 |
28 | run_role_labeling_model(){
29 | model=${1}
30 | is_train=${2}
31 | pred_save_path=../data/result/${model}
32 | cd Event_extraction || { echo "Enter Failure"; exit 1; }
33 | sh run_role_labeling.sh .${data_dir}/${model} .${conf_dir}/${model}_tag.dict .${ckpt_dir}/${model} .${pred_data} ${learning_rate} ${is_train} ${max_seq_len} ${batch_size} ${epoch} ${pred_save_path}
34 | }
35 |
36 | run_trigger_labeling_model(){
37 | model=${1}
38 | is_train=${2}
39 | pred_save_path=../data/result/${model}
40 | cd Event_extraction || { echo "Enter Failure"; exit 1; }
41 | sh run_trigger_labeling.sh .${data_dir}/${model} .${conf_dir}/${model}_tag.dict .${ckpt_dir}/${model} .${pred_data} ${learning_rate} ${is_train} ${max_seq_len} ${batch_size} ${epoch} ${pred_save_path}
42 | }
43 |
44 | if [ ${process_name} == data_prepare ]; then
45 | echo -e "\nstart ${dataset_name} data prepare"
46 | cd Event_extraction || { echo "Enter Failure"; exit 1; }
47 | python duee_1_data_prepare.py
48 | echo -e "end ${dataset_name} data prepare"
49 | elif [ ${process_name} == data_augmentation ]; then
50 | echo -e "\nstart ${dataset_name} data augmentation"
51 | cd Event_extraction || { echo "Enter Failure"; exit 1; }
52 | python data_agumentation.py
53 | echo -e "end ${dataset_name} data augmentation"
54 | elif [ ${process_name} == multi_label_train ]; then
55 | echo -e "\nstart multi_label train"
56 | cd bert_multi_classification || { echo "Enter Failure"; exit 1; }
57 | python train.py
58 | echo -e "end multi_label train"
59 | elif [ ${process_name} == multi_label_predict ]; then
60 | echo -e "\nstart multi_label predict"
61 | cd bert_multi_classification || { echo "Enter Failure"; exit 1; }
62 | python predict_result.py
63 | echo -e "end multi_label predict"
64 | elif [ ${process_name} == trigger_train ]; then
65 | echo -e "\nstart ${dataset_name} trigger train"
66 | run_trigger_labeling_model trigger True
67 | echo -e "end ${dataset_name} trigger train"
68 | elif [ ${process_name} == trigger_predict ]; then
69 | echo -e "\nstart ${dataset_name} trigger predict"
70 | run_trigger_labeling_model trigger False
71 | echo -e "end ${dataset_name} trigger predict"
72 | elif [ ${process_name} == role_train ]; then
73 | echo -e "\nstart ${dataset_name} role train"
74 | run_role_labeling_model role True
75 | echo -e "end ${dataset_name} role train"
76 | elif [ ${process_name} == role_predict ]; then
77 | echo -e "\nstart ${dataset_name} role predict"
78 | run_role_labeling_model role False
79 | echo -e "end ${dataset_name} role predict"
80 | elif [ ${process_name} == objDec_train ]; then
81 | echo -e "\nstart yolov5 train"
82 | cd yolov5 || { echo "Enter Failure"; exit 1; }
83 | python train.py
84 | echo -e "end yolov5 train"
85 | elif [ ${process_name} == objDec_predict ]; then
86 | echo -e "\nstart yolov5 predict"
87 | cd yolov5 || { echo "Enter Failure"; exit 1; }
88 | python predict.py
89 | echo -e "end yolov5 predict"
90 | elif [ ${process_name} == pred_2_submit ]; then
91 | echo -e "\nstart ${process_name} predict data merge to submit format"
92 | python postprocess.py
93 | echo -e "end ${process_name} predict data merge"
94 | elif [ ${process_name} == pipeline ]; then
95 | echo -e "\nstart ${process_name} process, it will take some time!!"
96 | # run multi-label multi-class prediction
97 | echo -e "\nstart multi_label predict"
98 | cd bert_multi_classification || { echo "Enter Failure"; exit 1; }
99 | python predict_result.py
100 | cd ../ || { echo "Enter Failure"; exit 1; }
101 | echo -e "end multi_label predict"
102 | # run trigger-word prediction
103 | echo -e "\nstart ${dataset_name} trigger predict"
104 | run_trigger_labeling_model trigger False
105 | cd ../ || { echo "Enter Failure"; exit 1; }
106 | echo -e "end ${dataset_name} trigger predict"
107 | # run argument (role) prediction
108 | echo -e "\nstart ${dataset_name} role predict"
109 | run_role_labeling_model role False
110 | cd ../ || { echo "Enter Failure"; exit 1; }
111 | echo -e "end ${dataset_name} role predict"
112 | # run object-detection prediction
113 | echo -e "\nstart yolov5 predict"
114 | cd yolov5 || { echo "Enter Failure"; exit 1; }
115 | python predict.py
116 | cd ../ || { echo "Enter Failure"; exit 1; }
117 | echo -e "end yolov5 predict"
118 | # run post-processing
119 | echo -e "\nstart ${process_name} predict data merge to submit format"
120 | python postprocess.py
121 | echo -e "end ${process_name} predict data merge"
122 |
123 | echo -e "end ${process_name} process"
124 | else
125 | echo "no process name ${process_name}"
126 | fi
--------------------------------------------------------------------------------
/tree.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2022/8/12 14:45
3 | # @Author: Jielong Tang
4 | # @File : tree.py
5 |
6 | import sys
7 | from pathlib import Path
8 |
9 |
10 | class DirectionTree(object):
11 | """Generate a directory tree.
12 | @ pathname: target directory
13 | @ filename: name of the file to save the tree to
14 | """
15 |
16 | def __init__(self, pathname='.', filename='tree.txt'):
17 | super(DirectionTree, self).__init__()
18 | self.pathname = Path(pathname)
19 | self.filename = filename
20 | self.tree = ''
21 |
22 | def set_path(self, pathname):
23 | self.pathname = Path(pathname)
24 |
25 | def set_filename(self, filename):
26 | self.filename = filename
27 |
28 | def generate_tree(self, n=0):
29 | if self.pathname.is_file():
30 | self.tree += ' |' * n + '-' * 2 + self.pathname.name + '\n'
31 | elif self.pathname.is_dir():
32 | self.tree += ' |' * n + '-' * 2 + \
33 | str(self.pathname.relative_to(self.pathname.parent)) + '\\' + '\n'
34 |
35 | for cp in self.pathname.iterdir():
36 | self.pathname = Path(cp)
37 | self.generate_tree(n + 1)
38 |
39 | def save_file(self):
40 | with open(self.filename, 'w', encoding='utf-8') as f:
41 | f.write(self.tree)
42 |
43 |
44 | if __name__ == '__main__':
45 | dirtree = DirectionTree()
46 | # no command-line arguments: generate the tree of the current directory
47 | if len(sys.argv) == 1:
48 | dirtree.set_path(Path.cwd())
49 | dirtree.generate_tree()
50 | print(dirtree.tree)
51 | # one command-line argument and the given directory exists
52 | elif len(sys.argv) == 2 and Path(sys.argv[1]).exists():
53 | dirtree.set_path(sys.argv[1])
54 | dirtree.generate_tree()
55 | print(dirtree.tree)
56 | # two command-line arguments and the given directory exists
57 | elif len(sys.argv) == 3 and Path(sys.argv[1]).exists():
58 | dirtree.set_path(sys.argv[1])
59 | dirtree.generate_tree()
60 | dirtree.set_filename(sys.argv[2])
61 | dirtree.save_file()
62 | else: # too many arguments to parse
63 | print('Too many command-line arguments, please check!')
--------------------------------------------------------------------------------
/yolov5/README_yolov5.md:
--------------------------------------------------------------------------------
1 | ##### Data processing
2 |
3 | ---
4 |
5 | - Set the dataset config file: data/ccks2022.yaml
6 | - path / train / val paths
7 | - nc: number of classes
8 | - names: class names
9 |
10 |
11 |
12 | ##### Train
13 |
14 | ---
15 |
16 | ```shell
17 | cd ROOTPATH/yolov5
18 |
19 | python train.py --data data/ccks2022.yaml --weights /data/zxwang/models/yolov5s/yolov5l.pt --img 640 --epochs 100 --batch-size 16
20 | ```
21 |
22 |
23 |
24 | ##### Predict
25 |
26 | ---
27 |
28 | ```shell
29 | cd ROOTPATH/yolov5
30 |
31 | python predict.py --input_img_source /data/zxwang/ccks2022/val/dev_images/ --weights /data/zxwang/models/yolov5s/yolov5l.pt --img 640 --epochs 100 --batch-size 16
32 | ```
33 |
34 |
--------------------------------------------------------------------------------
/yolov5/data/ccks2022.yaml:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | # CCKS2022 multi-modal event extraction object-detection dataset
3 | # Example usage: python train.py --data ccks2022.yaml
4 | # parent
5 | # ├── yolov5
6 | # └── data
7 | # └── ObjectDect ← dataset root used below
8 |
9 |
10 | # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
11 | path: ../data/ObjectDect # dataset root dir
12 | train: train_images/images # train images (relative to 'path')
13 | val: dev_images/images # val images (relative to 'path')
14 | test: # test images (optional)
15 |
16 | # Classes
17 | nc: 5 # number of classes
18 | names: ['airplane', 'boat', 'missile', 'truck', 'submarine'] # class names
19 |
20 |
21 | # Download script/URL (optional)
22 | # download: https://ultralytics.com/assets/coco128.zip
23 |
--------------------------------------------------------------------------------
/yolov5/data/coco128.yaml:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | # COCO128 dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017) by Ultralytics
3 | # Example usage: python train.py --data coco128.yaml
4 | # parent
5 | # ├── yolov5
6 | # └── datasets
7 | # └── coco128 ← downloads here (7 MB)
8 |
9 |
10 | # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
11 | path: ../datasets/coco128 # dataset root dir
12 | train: images/train2017 # train images (relative to 'path') 128 images
13 | val: images/train2017 # val images (relative to 'path') 128 images
14 | test: # test images (optional)
15 |
16 | # Classes
17 | nc: 80 # number of classes
18 | names: ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
19 | 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
20 | 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
21 | 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
22 | 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
23 | 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
24 | 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
25 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
26 | 'hair drier', 'toothbrush'] # class names
27 |
28 |
29 | # Download script/URL (optional)
30 | download: https://ultralytics.com/assets/coco128.zip
31 |
--------------------------------------------------------------------------------
/yolov5/data/hyps/hyp.Objects365.yaml:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | # Hyperparameters for Objects365 training
3 | # python train.py --weights yolov5m.pt --data Objects365.yaml --evolve
4 | # See Hyperparameter Evolution tutorial for details https://github.com/ultralytics/yolov5#tutorials
5 |
6 | lr0: 0.00258
7 | lrf: 0.17
8 | momentum: 0.779
9 | weight_decay: 0.00058
10 | warmup_epochs: 1.33
11 | warmup_momentum: 0.86
12 | warmup_bias_lr: 0.0711
13 | box: 0.0539
14 | cls: 0.299
15 | cls_pw: 0.825
16 | obj: 0.632
17 | obj_pw: 1.0
18 | iou_t: 0.2
19 | anchor_t: 3.44
20 | anchors: 3.2
21 | fl_gamma: 0.0
22 | hsv_h: 0.0188
23 | hsv_s: 0.704
24 | hsv_v: 0.36
25 | degrees: 0.0
26 | translate: 0.0902
27 | scale: 0.491
28 | shear: 0.0
29 | perspective: 0.0
30 | flipud: 0.0
31 | fliplr: 0.5
32 | mosaic: 1.0
33 | mixup: 0.0
34 | copy_paste: 0.0
35 |
--------------------------------------------------------------------------------
/yolov5/data/hyps/hyp.VOC.yaml:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | # Hyperparameters for VOC training
3 | # python train.py --batch 128 --weights yolov5m6.pt --data VOC.yaml --epochs 50 --img 512 --hyp hyp.scratch-med.yaml --evolve
4 | # See Hyperparameter Evolution tutorial for details https://github.com/ultralytics/yolov5#tutorials
5 |
6 | # YOLOv5 Hyperparameter Evolution Results
7 | # Best generation: 467
8 | # Last generation: 996
9 | # metrics/precision, metrics/recall, metrics/mAP_0.5, metrics/mAP_0.5:0.95, val/box_loss, val/obj_loss, val/cls_loss
10 | # 0.87729, 0.85125, 0.91286, 0.72664, 0.0076739, 0.0042529, 0.0013865
11 |
12 | lr0: 0.00334
13 | lrf: 0.15135
14 | momentum: 0.74832
15 | weight_decay: 0.00025
16 | warmup_epochs: 3.3835
17 | warmup_momentum: 0.59462
18 | warmup_bias_lr: 0.18657
19 | box: 0.02
20 | cls: 0.21638
21 | cls_pw: 0.5
22 | obj: 0.51728
23 | obj_pw: 0.67198
24 | iou_t: 0.2
25 | anchor_t: 3.3744
26 | fl_gamma: 0.0
27 | hsv_h: 0.01041
28 | hsv_s: 0.54703
29 | hsv_v: 0.27739
30 | degrees: 0.0
31 | translate: 0.04591
32 | scale: 0.75544
33 | shear: 0.0
34 | perspective: 0.0
35 | flipud: 0.0
36 | fliplr: 0.5
37 | mosaic: 0.85834
38 | mixup: 0.04266
39 | copy_paste: 0.0
40 | anchors: 3.412
41 |
--------------------------------------------------------------------------------
/yolov5/data/hyps/hyp.scratch-high.yaml:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | # Hyperparameters for high-augmentation COCO training from scratch
3 | # python train.py --batch 32 --cfg yolov5m6.yaml --weights '' --data coco.yaml --img 1280 --epochs 300
4 | # See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials
5 |
6 | lr0: 0.01 # initial learning rate (SGD=1E-2, Adam=1E-3)
7 | lrf: 0.1 # final OneCycleLR learning rate (lr0 * lrf)
8 | momentum: 0.937 # SGD momentum/Adam beta1
9 | weight_decay: 0.0005 # optimizer weight decay 5e-4
10 | warmup_epochs: 3.0 # warmup epochs (fractions ok)
11 | warmup_momentum: 0.8 # warmup initial momentum
12 | warmup_bias_lr: 0.1 # warmup initial bias lr
13 | box: 0.05 # box loss gain
14 | cls: 0.3 # cls loss gain
15 | cls_pw: 1.0 # cls BCELoss positive_weight
16 | obj: 0.7 # obj loss gain (scale with pixels)
17 | obj_pw: 1.0 # obj BCELoss positive_weight
18 | iou_t: 0.20 # IoU training threshold
19 | anchor_t: 4.0 # anchor-multiple threshold
20 | # anchors: 3 # anchors per output layer (0 to ignore)
21 | fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5)
22 | hsv_h: 0.015 # image HSV-Hue augmentation (fraction)
23 | hsv_s: 0.7 # image HSV-Saturation augmentation (fraction)
24 | hsv_v: 0.4 # image HSV-Value augmentation (fraction)
25 | degrees: 0.0 # image rotation (+/- deg)
26 | translate: 0.1 # image translation (+/- fraction)
27 | scale: 0.9 # image scale (+/- gain)
28 | shear: 0.0 # image shear (+/- deg)
29 | perspective: 0.0 # image perspective (+/- fraction), range 0-0.001
30 | flipud: 0.0 # image flip up-down (probability)
31 | fliplr: 0.5 # image flip left-right (probability)
32 | mosaic: 1.0 # image mosaic (probability)
33 | mixup: 0.1 # image mixup (probability)
34 | copy_paste: 0.1 # segment copy-paste (probability)
35 |
--------------------------------------------------------------------------------
/yolov5/data/hyps/hyp.scratch-low.yaml:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | # Hyperparameters for low-augmentation COCO training from scratch
3 | # python train.py --batch 64 --cfg yolov5n6.yaml --weights '' --data coco.yaml --img 640 --epochs 300 --linear
4 | # See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials
5 |
6 | lr0: 0.01 # initial learning rate (SGD=1E-2, Adam=1E-3)
7 | lrf: 0.01 # final OneCycleLR learning rate (lr0 * lrf)
8 | momentum: 0.937 # SGD momentum/Adam beta1
9 | weight_decay: 0.0005 # optimizer weight decay 5e-4
10 | warmup_epochs: 3.0 # warmup epochs (fractions ok)
11 | warmup_momentum: 0.8 # warmup initial momentum
12 | warmup_bias_lr: 0.1 # warmup initial bias lr
13 | box: 0.05 # box loss gain
14 | cls: 0.5 # cls loss gain
15 | cls_pw: 1.0 # cls BCELoss positive_weight
16 | obj: 1.0 # obj loss gain (scale with pixels)
17 | obj_pw: 1.0 # obj BCELoss positive_weight
18 | iou_t: 0.20 # IoU training threshold
19 | anchor_t: 4.0 # anchor-multiple threshold
20 | # anchors: 3 # anchors per output layer (0 to ignore)
21 | fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5)
22 | hsv_h: 0.015 # image HSV-Hue augmentation (fraction)
23 | hsv_s: 0.7 # image HSV-Saturation augmentation (fraction)
24 | hsv_v: 0.4 # image HSV-Value augmentation (fraction)
25 | degrees: 0.0 # image rotation (+/- deg)
26 | translate: 0.1 # image translation (+/- fraction)
27 | scale: 0.5 # image scale (+/- gain)
28 | shear: 0.0 # image shear (+/- deg)
29 | perspective: 0.0 # image perspective (+/- fraction), range 0-0.001
30 | flipud: 0.0 # image flip up-down (probability)
31 | fliplr: 0.5 # image flip left-right (probability)
32 | mosaic: 1.0 # image mosaic (probability)
33 | mixup: 0.0 # image mixup (probability)
34 | copy_paste: 0.0 # segment copy-paste (probability)
35 |
--------------------------------------------------------------------------------
/yolov5/data/hyps/hyp.scratch-med.yaml:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | # Hyperparameters for medium-augmentation COCO training from scratch
3 | # python train.py --batch 32 --cfg yolov5m6.yaml --weights '' --data coco.yaml --img 1280 --epochs 300
4 | # See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials
5 |
6 | lr0: 0.01 # initial learning rate (SGD=1E-2, Adam=1E-3)
7 | lrf: 0.1 # final OneCycleLR learning rate (lr0 * lrf)
8 | momentum: 0.937 # SGD momentum/Adam beta1
9 | weight_decay: 0.0005 # optimizer weight decay 5e-4
10 | warmup_epochs: 3.0 # warmup epochs (fractions ok)
11 | warmup_momentum: 0.8 # warmup initial momentum
12 | warmup_bias_lr: 0.1 # warmup initial bias lr
13 | box: 0.05 # box loss gain
14 | cls: 0.3 # cls loss gain
15 | cls_pw: 1.0 # cls BCELoss positive_weight
16 | obj: 0.7 # obj loss gain (scale with pixels)
17 | obj_pw: 1.0 # obj BCELoss positive_weight
18 | iou_t: 0.20 # IoU training threshold
19 | anchor_t: 4.0 # anchor-multiple threshold
20 | # anchors: 3 # anchors per output layer (0 to ignore)
21 | fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5)
22 | hsv_h: 0.015 # image HSV-Hue augmentation (fraction)
23 | hsv_s: 0.7 # image HSV-Saturation augmentation (fraction)
24 | hsv_v: 0.4 # image HSV-Value augmentation (fraction)
25 | degrees: 0.0 # image rotation (+/- deg)
26 | translate: 0.1 # image translation (+/- fraction)
27 | scale: 0.9 # image scale (+/- gain)
28 | shear: 0.0 # image shear (+/- deg)
29 | perspective: 0.0 # image perspective (+/- fraction), range 0-0.001
30 | flipud: 0.0 # image flip up-down (probability)
31 | fliplr: 0.5 # image flip left-right (probability)
32 | mosaic: 1.0 # image mosaic (probability)
33 | mixup: 0.1 # image mixup (probability)
34 | copy_paste: 0.0 # segment copy-paste (probability)
35 |
--------------------------------------------------------------------------------
/yolov5/data/scripts/download_weights.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
3 | # Download latest models from https://github.com/ultralytics/yolov5/releases
4 | # Example usage: bash path/to/download_weights.sh
5 | # parent
6 | # └── yolov5
7 | # ├── yolov5s.pt ← downloads here
8 | # ├── yolov5m.pt
9 | # └── ...
10 |
11 | python - <<EOF
--------------------------------------------------------------------------------
/yolov5/utils/activations.py:
--------------------------------------------------------------------------------
64 | class AconC(nn.Module):
65 | r""" ACON activation (activate or not)
66 | AconC: (p1*x-p2*x) * sigmoid(beta*(p1*x-p2*x)) + p2*x, beta is a learnable parameter
67 | according to "Activate or Not: Learning Customized Activation" <https://arxiv.org/pdf/2009.04759.pdf>.
68 | """
69 |
70 | def __init__(self, c1):
71 | super().__init__()
72 | self.p1 = nn.Parameter(torch.randn(1, c1, 1, 1))
73 | self.p2 = nn.Parameter(torch.randn(1, c1, 1, 1))
74 | self.beta = nn.Parameter(torch.ones(1, c1, 1, 1))
75 |
76 | def forward(self, x):
77 | dpx = (self.p1 - self.p2) * x
78 | return dpx * torch.sigmoid(self.beta * dpx) + self.p2 * x
79 |
80 |
81 | class MetaAconC(nn.Module):
82 | r""" ACON activation (activate or not)
83 | MetaAconC: (p1*x-p2*x) * sigmoid(beta*(p1*x-p2*x)) + p2*x, beta is generated by a small network
84 | according to "Activate or Not: Learning Customized Activation" <https://arxiv.org/pdf/2009.04759.pdf>.
85 | """
86 |
87 | def __init__(self, c1, k=1, s=1, r=16): # ch_in, kernel, stride, r
88 | super().__init__()
89 | c2 = max(r, c1 // r)
90 | self.p1 = nn.Parameter(torch.randn(1, c1, 1, 1))
91 | self.p2 = nn.Parameter(torch.randn(1, c1, 1, 1))
92 | self.fc1 = nn.Conv2d(c1, c2, k, s, bias=True)
93 | self.fc2 = nn.Conv2d(c2, c1, k, s, bias=True)
94 | # self.bn1 = nn.BatchNorm2d(c2)
95 | # self.bn2 = nn.BatchNorm2d(c1)
96 |
97 | def forward(self, x):
98 | y = x.mean(dim=2, keepdims=True).mean(dim=3, keepdims=True)
99 | # batch-size 1 bug/instabilities https://github.com/ultralytics/yolov5/issues/2891
100 | # beta = torch.sigmoid(self.bn2(self.fc2(self.bn1(self.fc1(y))))) # bug/unstable
101 | beta = torch.sigmoid(self.fc2(self.fc1(y))) # bug patch BN layers removed
102 | dpx = (self.p1 - self.p2) * x
103 | return dpx * torch.sigmoid(beta * dpx) + self.p2 * x
104 |
--------------------------------------------------------------------------------
/yolov5/utils/autoanchor.py:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | """
3 | AutoAnchor utils
4 | """
5 |
6 | import random
7 |
8 | import numpy as np
9 | import torch
10 | import yaml
11 | from tqdm import tqdm
12 |
13 | from utils.general import LOGGER, colorstr, emojis
14 |
15 | PREFIX = colorstr('AutoAnchor: ')
16 |
17 |
18 | def check_anchor_order(m):
19 | # Check anchor order against stride order for YOLOv5 Detect() module m, and correct if necessary
20 | a = m.anchors.prod(-1).mean(-1).view(-1) # mean anchor area per output layer
21 | da = a[-1] - a[0] # delta a
22 | ds = m.stride[-1] - m.stride[0] # delta s
23 | if da and (da.sign() != ds.sign()): # same order
24 | LOGGER.info('{}Reversing anchor order'.format(PREFIX))
25 | m.anchors[:] = m.anchors.flip(0)
26 |
27 |
28 | def check_anchors(dataset, model, thr=4.0, imgsz=640):
29 | # Check anchor fit to data, recompute if necessary
30 | m = model.module.model[-1] if hasattr(model, 'module') else model.model[-1] # Detect()
31 | shapes = imgsz * dataset.shapes / dataset.shapes.max(1, keepdims=True)
32 | scale = np.random.uniform(0.9, 1.1, size=(shapes.shape[0], 1)) # augment scale
33 | wh = torch.tensor(np.concatenate([l[:, 3:5] * s for s, l in zip(shapes * scale, dataset.labels)])).float() # wh
34 |
35 | def metric(k): # compute metric
36 | r = wh[:, None] / k[None]
37 | x = torch.min(r, 1 / r).min(2)[0] # ratio metric
38 | best = x.max(1)[0] # best_x
39 | aat = (x > 1 / thr).float().sum(1).mean() # anchors above threshold
40 | bpr = (best > 1 / thr).float().mean() # best possible recall
41 | return bpr, aat
42 |
43 | stride = m.stride.to(m.anchors.device).view(-1, 1, 1) # model strides
44 | anchors = m.anchors.clone() * stride # current anchors
45 | bpr, aat = metric(anchors.cpu().view(-1, 2))
46 | s = '\n{}{:.2f} anchors/target, {:.3f} Best Possible Recall (BPR). '.format(PREFIX, aat, bpr)
47 | if bpr > 0.98: # threshold to recompute
48 | LOGGER.info(emojis('{}Current anchors are a good fit to dataset ✅'.format(s)))
49 | else:
50 | LOGGER.info(emojis('{}Anchors are a poor fit to dataset ⚠️, attempting to improve...'.format(s)))
51 | na = m.anchors.numel() // 2 # number of anchors
52 | try:
53 | anchors = kmean_anchors(dataset, n=na, img_size=imgsz, thr=thr, gen=1000, verbose=False)
54 | except Exception as e:
55 | LOGGER.info('{}ERROR: {}'.format(PREFIX, e))
56 | new_bpr = metric(anchors)[0]
57 | if new_bpr > bpr: # replace anchors
58 | anchors = torch.tensor(anchors, device=m.anchors.device).type_as(m.anchors)
59 | m.anchors[:] = anchors.clone().view_as(m.anchors)
60 | check_anchor_order(m) # must be in pixel-space (not grid-space)
61 | m.anchors /= stride
62 | s = '{}Done ✅ (optional: update model *.yaml to use these anchors in the future)'.format(PREFIX)
63 | else:
64 | s = '{}Done ⚠️ (original anchors better than new anchors, proceeding with original anchors)'.format(PREFIX)
65 | LOGGER.info(emojis(s))
66 |
67 |
68 | def kmean_anchors(dataset='./data/coco128.yaml', n=9, img_size=640, thr=4.0, gen=1000, verbose=True):
69 | """ Creates kmeans-evolved anchors from training dataset
70 |
71 | Arguments:
72 | dataset: path to data.yaml, or a loaded dataset
73 | n: number of anchors
74 | img_size: image size used for training
75 | thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0
76 | gen: generations to evolve anchors using genetic algorithm
77 | verbose: print all results
78 |
79 | Return:
80 | k: kmeans evolved anchors
81 |
82 | Usage:
83 | from utils.autoanchor import *; _ = kmean_anchors()
84 | """
85 | from scipy.cluster.vq import kmeans
86 |
87 | npr = np.random
88 | thr = 1 / thr
89 |
90 | def metric(k, wh): # compute metrics
91 | r = wh[:, None] / k[None]
92 | x = torch.min(r, 1 / r).min(2)[0] # ratio metric
93 | # x = wh_iou(wh, torch.tensor(k)) # iou metric
94 | return x, x.max(1)[0] # x, best_x
95 |
96 | def anchor_fitness(k): # mutation fitness
97 | _, best = metric(torch.tensor(k, dtype=torch.float32), wh)
98 | return (best * (best > thr).float()).mean() # fitness
99 |
100 | def print_results(k, verbose=True):
101 | k = k[np.argsort(k.prod(1))] # sort small to large
102 | x, best = metric(k, wh0)
103 | bpr, aat = (best > thr).float().mean(), (x > thr).float().mean() * n # best possible recall, anch > thr
104 | s = '{}thr={:.2f}: {:.4f} best possible recall, {:.2f} anchors past thr\n'.format(PREFIX, thr, bpr, aat) + \
105 | '{}n={}, img_size={}, metric_all={:.3f}/{:.3f}-mean/best, '.format(PREFIX, n, img_size, x.mean(), best.mean()) + \
106 | 'past_thr={:.3f}-mean: '.format(x[x > thr].mean())
107 | for x in k:
108 | s += '%i,%i, ' % (round(x[0]), round(x[1]))
109 | if verbose:
110 | LOGGER.info(s[:-2])
111 | return k
112 |
113 | if isinstance(dataset, str): # *.yaml file
114 | with open(dataset, errors='ignore') as f:
115 | data_dict = yaml.safe_load(f) # model dict
116 | from utils.dataloaders import LoadImagesAndLabels
117 | dataset = LoadImagesAndLabels(data_dict['train'], augment=True, rect=True)
118 |
119 | # Get label wh
120 | shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True)
121 | wh0 = np.concatenate([l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)]) # wh
122 |
123 | # Filter
124 | i = (wh0 < 3.0).any(1).sum()
125 | if i:
126 | LOGGER.info('{}WARNING: Extremely small objects found: {} of {} labels are < 3 pixels in size'.format(PREFIX, i, len(wh0)))
127 | wh = wh0[(wh0 >= 2.0).any(1)] # filter > 2 pixels
128 | # wh = wh * (npr.rand(wh.shape[0], 1) * 0.9 + 0.1) # multiply by random scale 0-1
129 |
130 | # Kmeans init
131 | try:
132 | LOGGER.info('{}Running kmeans for {} anchors on {} points...'.format(PREFIX, n, len(wh)))
133 | assert n <= len(wh) # apply overdetermined constraint
134 | s = wh.std(0) # sigmas for whitening
135 | k = kmeans(wh / s, n, iter=30)[0] * s # points
136 | assert n == len(k) # kmeans may return fewer points than requested if wh is insufficient or too similar
137 | except Exception:
138 | LOGGER.warning('{}WARNING: switching strategies from kmeans to random init'.format(PREFIX))
139 | k = np.sort(npr.rand(n * 2)).reshape(n, 2) * img_size # random init
140 | wh, wh0 = (torch.tensor(x, dtype=torch.float32) for x in (wh, wh0))
141 | k = print_results(k, verbose=False)
142 |
143 | # Plot
144 | # k, d = [None] * 20, [None] * 20
145 | # for i in tqdm(range(1, 21)):
146 | # k[i-1], d[i-1] = kmeans(wh / s, i) # points, mean distance
147 | # fig, ax = plt.subplots(1, 2, figsize=(14, 7), tight_layout=True)
148 | # ax = ax.ravel()
149 | # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.')
150 | # fig, ax = plt.subplots(1, 2, figsize=(14, 7)) # plot wh
151 | # ax[0].hist(wh[wh[:, 0]<100, 0],400)
152 | # ax[1].hist(wh[wh[:, 1]<100, 1],400)
153 | # fig.savefig('wh.png', dpi=200)
154 |
155 | # Evolve
156 | f, sh, mp, s = anchor_fitness(k), k.shape, 0.9, 0.1 # fitness, generations, mutation prob, sigma
157 | pbar = tqdm(range(gen), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar
158 | for _ in pbar:
159 | v = np.ones(sh)
160 | while (v == 1).all(): # mutate until a change occurs (prevent duplicates)
161 | v = ((npr.random(sh) < mp) * random.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
162 | kg = (k.copy() * v).clip(min=2.0)
163 | fg = anchor_fitness(kg)
164 | if fg > f:
165 | f, k = fg, kg.copy()
166 | pbar.desc = '{}Evolving anchors with Genetic Algorithm: fitness = {:.4f}'.format(PREFIX, f)
167 | if verbose:
168 | print_results(k, verbose)
169 |
170 | return print_results(k)
171 |
--------------------------------------------------------------------------------
/yolov5/utils/autobatch.py:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | """
3 | Auto-batch utils
4 | """
5 |
6 | from copy import deepcopy
7 |
8 | import numpy as np
9 | import torch
10 |
11 | from utils.general import LOGGER, colorstr, emojis
12 | from utils.torch_utils import profile
13 |
14 |
15 | def check_train_batch_size(model, imgsz=640, amp=True):
16 | # Check YOLOv5 training batch size
17 | with torch.cuda.amp.autocast(amp):
18 | return autobatch(deepcopy(model).train(), imgsz) # compute optimal batch size
19 |
20 |
21 | def autobatch(model, imgsz=640, fraction=0.9, batch_size=16):
22 | # Automatically estimate best batch size to use `fraction` of available CUDA memory
23 | # Usage:
24 | # import torch
25 | # from utils.autobatch import autobatch
26 | # model = torch.hub.load('ultralytics/yolov5', 'yolov5s', autoshape=False)
27 | # print(autobatch(model))
28 |
29 | # Check device
30 | prefix = colorstr('AutoBatch: ')
31 | LOGGER.info(f'{prefix}Computing optimal batch size for --imgsz {imgsz}')
32 | device = next(model.parameters()).device # get model device
33 | if device.type == 'cpu':
34 | LOGGER.info(f'{prefix}CUDA not detected, using default CPU batch-size {batch_size}')
35 | return batch_size
36 |
37 | # Inspect CUDA memory
38 | gb = 1 << 30 # bytes to GiB (1024 ** 3)
39 | d = str(device).upper() # 'CUDA:0'
40 | properties = torch.cuda.get_device_properties(device) # device properties
41 | t = properties.total_memory / gb # GiB total
42 | r = torch.cuda.memory_reserved(device) / gb # GiB reserved
43 | a = torch.cuda.memory_allocated(device) / gb # GiB allocated
44 | f = t - (r + a) # GiB free
45 | LOGGER.info(f'{prefix}{d} ({properties.name}) {t:.2f}G total, {r:.2f}G reserved, {a:.2f}G allocated, {f:.2f}G free')
46 |
47 | # Profile batch sizes
48 | batch_sizes = [1, 2, 4, 8, 16]
49 | try:
50 | img = [torch.zeros(b, 3, imgsz, imgsz) for b in batch_sizes]
51 | results = profile(img, model, n=3, device=device)
52 | except Exception as e:
53 | LOGGER.warning(f'{prefix}{e}')
54 |
55 | # Fit a solution
56 | y = [x[2] for x in results if x] # memory [2]
57 | p = np.polyfit(batch_sizes[:len(y)], y, deg=1) # first degree polynomial fit
58 | b = int((f * fraction - p[1]) / p[0]) # y intercept (optimal batch size)
59 | if None in results: # some sizes failed
60 | i = results.index(None) # first fail index
61 | if b >= batch_sizes[i]: # y intercept above failure point
62 | b = batch_sizes[max(i - 1, 0)] # select prior safe point
63 |
64 | fraction = np.polyval(p, b) / t # actual fraction predicted
65 | LOGGER.info(emojis(f'{prefix}Using batch-size {b} for {d} {t * fraction:.2f}G/{t:.2f}G ({fraction * 100:.0f}%) ✅'))
66 | return b
67 |
--------------------------------------------------------------------------------
/yolov5/utils/aws/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tangjielong928/Multi-modal-Event-Extraction/4e133e609f3ada97862dd65123652f569761de5d/yolov5/utils/aws/__init__.py
--------------------------------------------------------------------------------
/yolov5/utils/aws/mime.sh:
--------------------------------------------------------------------------------
1 | # AWS EC2 instance startup 'MIME' script https://aws.amazon.com/premiumsupport/knowledge-center/execute-user-data-ec2/
2 | # This script will run on every instance restart, not only on first start
3 | # --- DO NOT COPY ABOVE COMMENTS WHEN PASTING INTO USERDATA ---
4 |
5 | Content-Type: multipart/mixed; boundary="//"
6 | MIME-Version: 1.0
7 |
8 | --//
9 | Content-Type: text/cloud-config; charset="us-ascii"
10 | MIME-Version: 1.0
11 | Content-Transfer-Encoding: 7bit
12 | Content-Disposition: attachment; filename="cloud-config.txt"
13 |
14 | #cloud-config
15 | cloud_final_modules:
16 | - [scripts-user, always]
17 |
18 | --//
19 | Content-Type: text/x-shellscript; charset="us-ascii"
20 | MIME-Version: 1.0
21 | Content-Transfer-Encoding: 7bit
22 | Content-Disposition: attachment; filename="userdata.txt"
23 |
24 | #!/bin/bash
25 | # --- paste contents of userdata.sh here ---
26 | --//
27 |
--------------------------------------------------------------------------------
/yolov5/utils/aws/resume.py:
--------------------------------------------------------------------------------
1 | # Resume all interrupted trainings in yolov5/ dir including DDP trainings
2 | # Usage: $ python utils/aws/resume.py
3 |
4 | import os
5 | import sys
6 | from pathlib import Path
7 |
8 | import torch
9 | import yaml
10 |
11 | FILE = Path(__file__).resolve()
12 | ROOT = FILE.parents[2] # YOLOv5 root directory
13 | if str(ROOT) not in sys.path:
14 | sys.path.append(str(ROOT)) # add ROOT to PATH
15 |
16 | port = 0 # --master_port
17 | path = Path('').resolve()
18 | for last in path.rglob('*/**/last.pt'):
19 | ckpt = torch.load(last)
20 | if ckpt['optimizer'] is None:
21 | continue
22 |
23 | # Load opt.yaml
24 | with open(last.parent.parent / 'opt.yaml', errors='ignore') as f:
25 | opt = yaml.safe_load(f)
26 |
27 | # Get device count
28 | d = opt['device'].split(',') # devices
29 | nd = len(d) # number of devices
30 | ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1) # distributed data parallel
31 |
32 | if ddp: # multi-GPU
33 | port += 1
34 | cmd = f'python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {last}'
35 | else: # single-GPU
36 | cmd = f'python train.py --resume {last}'
37 |
38 | cmd += ' > /dev/null 2>&1 &' # redirect output to dev/null and run in daemon thread
39 | print(cmd)
40 | os.system(cmd)
41 |
--------------------------------------------------------------------------------
/yolov5/utils/aws/userdata.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # AWS EC2 instance startup script https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html
3 | # This script will run only once on first instance start (for a re-start script see mime.sh)
4 | # /home/ubuntu (ubuntu) or /home/ec2-user (amazon-linux) is working dir
5 | # Use >300 GB SSD
6 |
7 | cd home/ubuntu
8 | if [ ! -d yolov5 ]; then
9 | echo "Running first-time script." # install dependencies, download COCO, pull Docker
10 | git clone https://github.com/ultralytics/yolov5 -b master && sudo chmod -R 777 yolov5
11 | cd yolov5
12 | bash data/scripts/get_coco.sh && echo "COCO done." &
13 | sudo docker pull ultralytics/yolov5:latest && echo "Docker done." &
14 | python -m pip install --upgrade pip && pip install -r requirements.txt && python detect.py && echo "Requirements done." &
15 | wait && echo "All tasks done." # finish background tasks
16 | else
17 | echo "Running re-start script." # resume interrupted runs
18 | i=0
19 | list=$(sudo docker ps -qa) # container list i.e. $'one\ntwo\nthree\nfour'
20 | while IFS= read -r id; do
21 | ((i++))
22 | echo "restarting container $i: $id"
23 | sudo docker start $id
24 | # sudo docker exec -it $id python train.py --resume # single-GPU
25 | sudo docker exec -d $id python utils/aws/resume.py # multi-scenario
26 | done <<<"$list"
27 | fi
28 |
--------------------------------------------------------------------------------
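The re-start branch above walks every container ID from `docker ps -qa`, restarts it, and launches resume.py inside it detached. A rough Python equivalent of that loop, assuming the docker CLI is on PATH:

```python
import subprocess

ids = subprocess.run(['docker', 'ps', '-qa'], capture_output=True, text=True).stdout.split()
for i, cid in enumerate(ids, start=1):
    print(f'restarting container {i}: {cid}')
    subprocess.run(['docker', 'start', cid], check=True)
    # detached exec, mirroring `docker exec -d` in userdata.sh
    subprocess.run(['docker', 'exec', '-d', cid, 'python', 'utils/aws/resume.py'], check=True)
```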
/yolov5/utils/benchmarks.py:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | """
3 | Run YOLOv5 benchmarks on all supported export formats
4 |
5 | Format | `export.py --include` | Model
6 | --- | --- | ---
7 | PyTorch | - | yolov5s.pt
8 | TorchScript | `torchscript` | yolov5s.torchscript
9 | ONNX | `onnx` | yolov5s.onnx
10 | OpenVINO | `openvino` | yolov5s_openvino_model/
11 | TensorRT | `engine` | yolov5s.engine
12 | CoreML | `coreml` | yolov5s.mlmodel
13 | TensorFlow SavedModel | `saved_model` | yolov5s_saved_model/
14 | TensorFlow GraphDef | `pb` | yolov5s.pb
15 | TensorFlow Lite | `tflite` | yolov5s.tflite
16 | TensorFlow Edge TPU | `edgetpu` | yolov5s_edgetpu.tflite
17 | TensorFlow.js | `tfjs` | yolov5s_web_model/
18 |
19 | Requirements:
20 | $ pip install -r requirements.txt coremltools onnx onnx-simplifier onnxruntime openvino-dev tensorflow-cpu # CPU
21 | $ pip install -r requirements.txt coremltools onnx onnx-simplifier onnxruntime-gpu openvino-dev tensorflow # GPU
22 | $ pip install -U nvidia-tensorrt --index-url https://pypi.ngc.nvidia.com # TensorRT
23 |
24 | Usage:
25 | $ python utils/benchmarks.py --weights yolov5s.pt --img 640
26 | """
27 |
28 | import argparse
29 | import sys
30 | import time
31 | from pathlib import Path
32 |
33 | import pandas as pd
34 |
35 | FILE = Path(__file__).resolve()
36 | ROOT = FILE.parents[1] # YOLOv5 root directory
37 | if str(ROOT) not in sys.path:
38 | sys.path.append(str(ROOT)) # add ROOT to PATH
39 | # ROOT = ROOT.relative_to(Path.cwd()) # relative
40 |
41 | import export
42 | import val
43 | from utils import notebook_init
44 | from utils.general import LOGGER, check_yaml, print_args
45 | from utils.torch_utils import select_device
46 |
47 |
48 | def run(
49 | weights=ROOT / 'yolov5s.pt', # weights path
50 | imgsz=640, # inference size (pixels)
51 | batch_size=1, # batch size
52 | data=ROOT / 'data/coco128.yaml', # dataset.yaml path
53 | device='', # cuda device, i.e. 0 or 0,1,2,3 or cpu
54 | half=False, # use FP16 half-precision inference
55 | test=False, # test exports only
56 | pt_only=False, # test PyTorch only
57 | ):
58 | y, t = [], time.time()
59 | device = select_device(device)
60 | for i, (name, f, suffix, gpu) in export.export_formats().iterrows(): # index, (name, file, suffix, gpu-capable)
61 | try:
62 | assert i != 9, 'Edge TPU not supported'
63 | assert i != 10, 'TF.js not supported'
64 | if device.type != 'cpu':
65 | assert gpu, f'{name} inference not supported on GPU'
66 |
67 | # Export
68 | if f == '-':
69 | w = weights # PyTorch format
70 | else:
71 | w = export.run(weights=weights, imgsz=[imgsz], include=[f], device=device, half=half)[-1] # all others
72 | assert suffix in str(w), 'export failed'
73 |
74 | # Validate
75 | result = val.run(data, w, batch_size, imgsz, plots=False, device=device, task='benchmark', half=half)
76 | metrics = result[0] # metrics (mp, mr, map50, map, *losses(box, obj, cls))
77 | speeds = result[2] # times (preprocess, inference, postprocess)
78 | y.append([name, round(metrics[3], 4), round(speeds[1], 2)]) # mAP, t_inference
79 | except Exception as e:
80 | LOGGER.warning(f'WARNING: Benchmark failure for {name}: {e}')
81 | y.append([name, None, None]) # mAP, t_inference
82 | if pt_only and i == 0:
83 | break # break after PyTorch
84 |
85 | # Print results
86 | LOGGER.info('\n')
87 | parse_opt()
88 | notebook_init() # print system info
89 |     py = pd.DataFrame(y, columns=['Format', 'mAP@0.5:0.95', 'Inference time (ms)'] if map else ['Format', 'Export', ''])  # note: 'map' here is the Python builtin, so it is always truthy
90 | LOGGER.info(f'\nBenchmarks complete ({time.time() - t:.2f}s)')
91 | LOGGER.info(str(py if map else py.iloc[:, :2]))
92 | return py
93 |
94 |
95 | def test(
96 | weights=ROOT / 'yolov5s.pt', # weights path
97 | imgsz=640, # inference size (pixels)
98 | batch_size=1, # batch size
99 | data=ROOT / 'data/coco128.yaml', # dataset.yaml path
100 | device='', # cuda device, i.e. 0 or 0,1,2,3 or cpu
101 | half=False, # use FP16 half-precision inference
102 | test=False, # test exports only
103 | pt_only=False, # test PyTorch only
104 | ):
105 | y, t = [], time.time()
106 | device = select_device(device)
107 | for i, (name, f, suffix, gpu) in export.export_formats().iterrows(): # index, (name, file, suffix, gpu-capable)
108 | try:
109 | w = weights if f == '-' else \
110 | export.run(weights=weights, imgsz=[imgsz], include=[f], device=device, half=half)[-1] # weights
111 | assert suffix in str(w), 'export failed'
112 | y.append([name, True])
113 | except Exception:
114 | y.append([name, False]) # mAP, t_inference
115 |
116 | # Print results
117 | LOGGER.info('\n')
118 | parse_opt()
119 | notebook_init() # print system info
120 | py = pd.DataFrame(y, columns=['Format', 'Export'])
121 | LOGGER.info(f'\nExports complete ({time.time() - t:.2f}s)')
122 | LOGGER.info(str(py))
123 | return py
124 |
125 |
126 | def parse_opt():
127 | parser = argparse.ArgumentParser()
128 | parser.add_argument('--weights', type=str, default=ROOT / 'yolov5s.pt', help='weights path')
129 | parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=640, help='inference size (pixels)')
130 | parser.add_argument('--batch-size', type=int, default=1, help='batch size')
131 | parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='dataset.yaml path')
132 | parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
133 | parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
134 | parser.add_argument('--test', action='store_true', help='test exports only')
135 | parser.add_argument('--pt-only', action='store_true', help='test PyTorch only')
136 | opt = parser.parse_args()
137 | opt.data = check_yaml(opt.data) # check YAML
138 | print_args(vars(opt))
139 | return opt
140 |
141 |
142 | def main(opt):
143 | test(**vars(opt)) if opt.test else run(**vars(opt))
144 |
145 |
146 | if __name__ == "__main__":
147 | opt = parse_opt()
148 | main(opt)
149 |
--------------------------------------------------------------------------------
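run() returns its results as a pandas DataFrame with the columns shown, using None for formats that failed to export or validate. A small sketch of consuming that table, e.g. picking the fastest format that still benchmarked successfully (assumes the yolov5/ directory is the working directory so `utils` imports resolve):

```python
from utils.benchmarks import run

py = run(weights='yolov5s.pt', imgsz=640, device='cpu')
ok = py.dropna(subset=['mAP@0.5:0.95', 'Inference time (ms)'])  # drop failed exports
fastest = ok.sort_values('Inference time (ms)').iloc[0]
print(f"fastest working format: {fastest['Format']} "
      f"({fastest['Inference time (ms)']} ms, mAP {fastest['mAP@0.5:0.95']})")
```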
/yolov5/utils/callbacks.py:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | """
3 | Callback utils
4 | """
5 |
6 |
7 | class Callbacks:
8 | """"
9 | Handles all registered callbacks for YOLOv5 Hooks
10 | """
11 |
12 | def __init__(self):
13 | # Define the available callbacks
14 | self._callbacks = {
15 | 'on_pretrain_routine_start': [],
16 | 'on_pretrain_routine_end': [],
17 | 'on_train_start': [],
18 | 'on_train_epoch_start': [],
19 | 'on_train_batch_start': [],
20 | 'optimizer_step': [],
21 | 'on_before_zero_grad': [],
22 | 'on_train_batch_end': [],
23 | 'on_train_epoch_end': [],
24 | 'on_val_start': [],
25 | 'on_val_batch_start': [],
26 | 'on_val_image_end': [],
27 | 'on_val_batch_end': [],
28 | 'on_val_end': [],
29 | 'on_fit_epoch_end': [], # fit = train + val
30 | 'on_model_save': [],
31 | 'on_train_end': [],
32 | 'on_params_update': [],
33 | 'teardown': [],}
34 | self.stop_training = False # set True to interrupt training
35 |
36 | def register_action(self, hook, name='', callback=None):
37 | """
38 | Register a new action to a callback hook
39 |
40 | Args:
41 | hook: The callback hook name to register the action to
42 | name: The name of the action for later reference
43 | callback: The callback to fire
44 | """
45 | assert hook in self._callbacks, f"hook '{hook}' not found in callbacks {self._callbacks}"
46 | assert callable(callback), f"callback '{callback}' is not callable"
47 | self._callbacks[hook].append({'name': name, 'callback': callback})
48 |
49 | def get_registered_actions(self, hook=None):
50 | """"
51 | Returns all the registered actions by callback hook
52 |
53 | Args:
54 | hook: The name of the hook to check, defaults to all
55 | """
56 | return self._callbacks[hook] if hook else self._callbacks
57 |
58 | def run(self, hook, *args, **kwargs):
59 | """
60 | Loop through the registered actions and fire all callbacks
61 |
62 | Args:
63 |             hook: The name of the hook whose callbacks to fire
64 | args: Arguments to receive from YOLOv5
65 | kwargs: Keyword Arguments to receive from YOLOv5
66 | """
67 |
68 | assert hook in self._callbacks, f"hook '{hook}' not found in callbacks {self._callbacks}"
69 |
70 | for logger in self._callbacks[hook]:
71 | logger['callback'](*args, **kwargs)
72 |
--------------------------------------------------------------------------------
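Callbacks is a plain hook registry: register_action() attaches a callable to a named hook and run() fires everything registered on it, forwarding *args and **kwargs. A minimal usage sketch; the single `epoch` argument is hypothetical, since each hook in train.py has its own signature:

```python
from utils.callbacks import Callbacks

callbacks = Callbacks()

def log_epoch(epoch):  # hypothetical action
    print(f'epoch {epoch} finished')

callbacks.register_action('on_train_epoch_end', name='log_epoch', callback=log_epoch)
callbacks.run('on_train_epoch_end', 5)  # -> "epoch 5 finished"
```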
/yolov5/utils/docker/.dockerignore:
--------------------------------------------------------------------------------
1 | # Repo-specific DockerIgnore -------------------------------------------------------------------------------------------
2 | #.git
3 | .cache
4 | .idea
5 | runs
6 | output
7 | coco
8 | storage.googleapis.com
9 |
10 | data/samples/*
11 | **/results*.csv
12 | *.jpg
13 |
14 | # Neural Network weights -----------------------------------------------------------------------------------------------
15 | **/*.pt
16 | **/*.pth
17 | **/*.onnx
18 | **/*.engine
19 | **/*.mlmodel
20 | **/*.torchscript
21 | **/*.torchscript.pt
22 | **/*.tflite
23 | **/*.h5
24 | **/*.pb
25 | *_saved_model/
26 | *_web_model/
27 | *_openvino_model/
28 |
29 | # Below Copied From .gitignore -----------------------------------------------------------------------------------------
30 |
31 |
32 |
33 | # GitHub Python GitIgnore ----------------------------------------------------------------------------------------------
34 | # Byte-compiled / optimized / DLL files
35 | __pycache__/
36 | *.py[cod]
37 | *$py.class
38 |
39 | # C extensions
40 | *.so
41 |
42 | # Distribution / packaging
43 | .Python
44 | env/
45 | build/
46 | develop-eggs/
47 | dist/
48 | downloads/
49 | eggs/
50 | .eggs/
51 | lib/
52 | lib64/
53 | parts/
54 | sdist/
55 | var/
56 | wheels/
57 | *.egg-info/
58 | wandb/
59 | .installed.cfg
60 | *.egg
61 |
62 | # PyInstaller
63 | # Usually these files are written by a python script from a template
64 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
65 | *.manifest
66 | *.spec
67 |
68 | # Installer logs
69 | pip-log.txt
70 | pip-delete-this-directory.txt
71 |
72 | # Unit test / coverage reports
73 | htmlcov/
74 | .tox/
75 | .coverage
76 | .coverage.*
77 | .cache
78 | nosetests.xml
79 | coverage.xml
80 | *.cover
81 | .hypothesis/
82 |
83 | # Translations
84 | *.mo
85 | *.pot
86 |
87 | # Django stuff:
88 | *.log
89 | local_settings.py
90 |
91 | # Flask stuff:
92 | instance/
93 | .webassets-cache
94 |
95 | # Scrapy stuff:
96 | .scrapy
97 |
98 | # Sphinx documentation
99 | docs/_build/
100 |
101 | # PyBuilder
102 | target/
103 |
104 | # Jupyter Notebook
105 | .ipynb_checkpoints
106 |
107 | # pyenv
108 | .python-version
109 |
110 | # celery beat schedule file
111 | celerybeat-schedule
112 |
113 | # SageMath parsed files
114 | *.sage.py
115 |
116 | # dotenv
117 | .env
118 |
119 | # virtualenv
120 | .venv*
121 | venv*/
122 | ENV*/
123 |
124 | # Spyder project settings
125 | .spyderproject
126 | .spyproject
127 |
128 | # Rope project settings
129 | .ropeproject
130 |
131 | # mkdocs documentation
132 | /site
133 |
134 | # mypy
135 | .mypy_cache/
136 |
137 |
138 | # https://github.com/github/gitignore/blob/master/Global/macOS.gitignore -----------------------------------------------
139 |
140 | # General
141 | .DS_Store
142 | .AppleDouble
143 | .LSOverride
144 |
145 | # Icon must end with two \r
146 | Icon
147 | Icon?
148 |
149 | # Thumbnails
150 | ._*
151 |
152 | # Files that might appear in the root of a volume
153 | .DocumentRevisions-V100
154 | .fseventsd
155 | .Spotlight-V100
156 | .TemporaryItems
157 | .Trashes
158 | .VolumeIcon.icns
159 | .com.apple.timemachine.donotpresent
160 |
161 | # Directories potentially created on remote AFP share
162 | .AppleDB
163 | .AppleDesktop
164 | Network Trash Folder
165 | Temporary Items
166 | .apdisk
167 |
168 |
169 | # https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore
170 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
171 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
172 |
173 | # User-specific stuff:
174 | .idea/*
175 | .idea/**/workspace.xml
176 | .idea/**/tasks.xml
177 | .idea/dictionaries
178 | .html # Bokeh Plots
179 | .pg # TensorFlow Frozen Graphs
180 | .avi # videos
181 |
182 | # Sensitive or high-churn files:
183 | .idea/**/dataSources/
184 | .idea/**/dataSources.ids
185 | .idea/**/dataSources.local.xml
186 | .idea/**/sqlDataSources.xml
187 | .idea/**/dynamic.xml
188 | .idea/**/uiDesigner.xml
189 |
190 | # Gradle:
191 | .idea/**/gradle.xml
192 | .idea/**/libraries
193 |
194 | # CMake
195 | cmake-build-debug/
196 | cmake-build-release/
197 |
198 | # Mongo Explorer plugin:
199 | .idea/**/mongoSettings.xml
200 |
201 | ## File-based project format:
202 | *.iws
203 |
204 | ## Plugin-specific files:
205 |
206 | # IntelliJ
207 | out/
208 |
209 | # mpeltonen/sbt-idea plugin
210 | .idea_modules/
211 |
212 | # JIRA plugin
213 | atlassian-ide-plugin.xml
214 |
215 | # Cursive Clojure plugin
216 | .idea/replstate.xml
217 |
218 | # Crashlytics plugin (for Android Studio and IntelliJ)
219 | com_crashlytics_export_strings.xml
220 | crashlytics.properties
221 | crashlytics-build.properties
222 | fabric.properties
223 |
--------------------------------------------------------------------------------
/yolov5/utils/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | # Builds ultralytics/yolov5:latest image on DockerHub https://hub.docker.com/r/ultralytics/yolov5
3 | # Image is CUDA-optimized for YOLOv5 single/multi-GPU training and inference
4 |
5 | # Start FROM NVIDIA PyTorch image https://ngc.nvidia.com/catalog/containers/nvidia:pytorch
6 | FROM nvcr.io/nvidia/pytorch:22.05-py3
7 | RUN rm -rf /opt/pytorch # remove 1.2GB dir
8 |
9 | # Downloads to user config dir
10 | ADD https://ultralytics.com/assets/Arial.ttf https://ultralytics.com/assets/Arial.Unicode.ttf /root/.config/Ultralytics/
11 |
12 | # Install linux packages
13 | RUN apt update && apt install --no-install-recommends -y zip htop screen libgl1-mesa-glx
14 |
15 | # Install pip packages
16 | COPY requirements.txt .
17 | RUN python -m pip install --upgrade pip
18 | RUN pip uninstall -y torch torchvision torchtext Pillow
19 | RUN pip install --no-cache -r requirements.txt albumentations wandb gsutil notebook 'Pillow>=9.1.0' \
20 | 'opencv-python<4.6.0.66' \
21 | --extra-index-url https://download.pytorch.org/whl/cu113
22 |
23 | # Create working directory
24 | RUN mkdir -p /usr/src/app
25 | WORKDIR /usr/src/app
26 |
27 | # Copy contents
28 | COPY . /usr/src/app
29 | RUN git clone https://github.com/ultralytics/yolov5 /usr/src/yolov5
30 |
31 | # Set environment variables
32 | ENV OMP_NUM_THREADS=8
33 |
34 |
35 | # Usage Examples -------------------------------------------------------------------------------------------------------
36 |
37 | # Build and Push
38 | # t=ultralytics/yolov5:latest && sudo docker build -f utils/docker/Dockerfile -t $t . && sudo docker push $t
39 |
40 | # Pull and Run
41 | # t=ultralytics/yolov5:latest && sudo docker pull $t && sudo docker run -it --ipc=host --gpus all $t
42 |
43 | # Pull and Run with local directory access
44 | # t=ultralytics/yolov5:latest && sudo docker pull $t && sudo docker run -it --ipc=host --gpus all -v "$(pwd)"/datasets:/usr/src/datasets $t
45 |
46 | # Kill all
47 | # sudo docker kill $(sudo docker ps -q)
48 |
49 | # Kill all image-based
50 | # sudo docker kill $(sudo docker ps -qa --filter ancestor=ultralytics/yolov5:latest)
51 |
52 | # Bash into running container
53 | # sudo docker exec -it 5a9b5863d93d bash
54 |
55 | # Bash into stopped container
56 | # id=$(sudo docker ps -qa) && sudo docker start $id && sudo docker exec -it $id bash
57 |
58 | # Clean up
59 | # docker system prune -a --volumes
60 |
61 | # Update Ubuntu drivers
62 | # https://www.maketecheasier.com/install-nvidia-drivers-ubuntu/
63 |
64 | # DDP test
65 | # python -m torch.distributed.run --nproc_per_node 2 --master_port 1 train.py --epochs 3
66 |
67 | # GCP VM from Image
68 | # docker.io/ultralytics/yolov5:latest
69 |
--------------------------------------------------------------------------------
/yolov5/utils/docker/Dockerfile-arm64:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | # Builds ultralytics/yolov5:latest-arm64 image on DockerHub https://hub.docker.com/r/ultralytics/yolov5
3 | # Image is aarch64-compatible for Apple M1 and other ARM architectures i.e. Jetson Nano and Raspberry Pi
4 |
5 | # Start FROM Ubuntu image https://hub.docker.com/_/ubuntu
6 | FROM arm64v8/ubuntu:20.04
7 |
8 | # Downloads to user config dir
9 | ADD https://ultralytics.com/assets/Arial.ttf https://ultralytics.com/assets/Arial.Unicode.ttf /root/.config/Ultralytics/
10 |
11 | # Install linux packages
12 | RUN apt update
13 | RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y tzdata
14 | RUN apt install --no-install-recommends -y python3-pip git zip curl htop gcc \
15 | libgl1-mesa-glx libglib2.0-0 libpython3.8-dev
16 | # RUN alias python=python3
17 |
18 | # Install pip packages
19 | COPY requirements.txt .
20 | RUN python3 -m pip install --upgrade pip
21 | RUN pip install --no-cache -r requirements.txt gsutil notebook \
22 | tensorflow-aarch64
23 | # tensorflowjs \
24 | # onnx onnx-simplifier onnxruntime \
25 | # coremltools openvino-dev \
26 |
27 | # Create working directory
28 | RUN mkdir -p /usr/src/app
29 | WORKDIR /usr/src/app
30 |
31 | # Copy contents
32 | COPY . /usr/src/app
33 | RUN git clone https://github.com/ultralytics/yolov5 /usr/src/yolov5
34 |
35 |
36 | # Usage Examples -------------------------------------------------------------------------------------------------------
37 |
38 | # Build and Push
39 | # t=ultralytics/yolov5:latest-M1 && sudo docker build --platform linux/arm64 -f utils/docker/Dockerfile-arm64 -t $t . && sudo docker push $t
40 |
41 | # Pull and Run
42 | # t=ultralytics/yolov5:latest-M1 && sudo docker pull $t && sudo docker run -it --ipc=host -v "$(pwd)"/datasets:/usr/src/datasets $t
43 |
--------------------------------------------------------------------------------
/yolov5/utils/docker/Dockerfile-cpu:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | # Builds ultralytics/yolov5:latest-cpu image on DockerHub https://hub.docker.com/r/ultralytics/yolov5
3 | # Image is CPU-optimized for ONNX, OpenVINO and PyTorch YOLOv5 deployments
4 |
5 | # Start FROM Ubuntu image https://hub.docker.com/_/ubuntu
6 | FROM ubuntu:20.04
7 |
8 | # Downloads to user config dir
9 | ADD https://ultralytics.com/assets/Arial.ttf https://ultralytics.com/assets/Arial.Unicode.ttf /root/.config/Ultralytics/
10 |
11 | # Install linux packages
12 | RUN apt update
13 | RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y tzdata
14 | RUN apt install --no-install-recommends -y python3-pip git zip curl htop libgl1-mesa-glx libglib2.0-0 libpython3.8-dev
15 | # RUN alias python=python3
16 |
17 | # Install pip packages
18 | COPY requirements.txt .
19 | RUN python3 -m pip install --upgrade pip
20 | RUN pip install --no-cache -r requirements.txt albumentations gsutil notebook \
21 | coremltools onnx onnx-simplifier onnxruntime openvino-dev tensorflow-cpu tensorflowjs \
22 | --extra-index-url https://download.pytorch.org/whl/cpu
23 |
24 | # Create working directory
25 | RUN mkdir -p /usr/src/app
26 | WORKDIR /usr/src/app
27 |
28 | # Copy contents
29 | COPY . /usr/src/app
30 | RUN git clone https://github.com/ultralytics/yolov5 /usr/src/yolov5
31 |
32 |
33 | # Usage Examples -------------------------------------------------------------------------------------------------------
34 |
35 | # Build and Push
36 | # t=ultralytics/yolov5:latest-cpu && sudo docker build -f utils/docker/Dockerfile-cpu -t $t . && sudo docker push $t
37 |
38 | # Pull and Run
39 | # t=ultralytics/yolov5:latest-cpu && sudo docker pull $t && sudo docker run -it --ipc=host -v "$(pwd)"/datasets:/usr/src/datasets $t
40 |
--------------------------------------------------------------------------------
/yolov5/utils/downloads.py:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | """
3 | Download utils
4 | """
5 |
6 | import logging
7 | import os
8 | import platform
9 | import subprocess
10 | import time
11 | import urllib
12 | from pathlib import Path
13 | from zipfile import ZipFile
14 |
15 | import requests
16 | import torch
17 |
18 |
19 | def is_url(url):
20 | # Check if online file exists
21 | try:
22 | r = urllib.request.urlopen(url) # response
23 | return r.getcode() == 200
24 | except urllib.request.HTTPError:
25 | return False
26 |
27 |
28 | def gsutil_getsize(url=''):
29 | # gs://bucket/file size https://cloud.google.com/storage/docs/gsutil/commands/du
30 | s = subprocess.check_output('gsutil du {}'.format(url), shell=True).decode('utf-8')
31 |     return int(s.split(' ')[0]) if len(s) else 0  # bytes (int() rather than eval() on external output)
32 |
33 |
34 | def safe_download(file, url, url2=None, min_bytes=1E0, error_msg=''):
35 | # Attempts to download file from url or url2, checks and removes incomplete downloads < min_bytes
36 | from utils.general import LOGGER
37 |
38 | file = Path(file)
39 | assert_msg = "Downloaded file '{}' does not exist or size is < min_bytes={}".format(file, min_bytes)
40 | try: # url1
41 | LOGGER.info('Downloading {} to {}...'.format(url, file))
42 | torch.hub.download_url_to_file(url, str(file), progress=LOGGER.level <= logging.INFO)
43 | assert file.exists() and file.stat().st_size > min_bytes, assert_msg # check
44 | except Exception as e: # url2
45 | file.unlink(missing_ok=True) # remove partial downloads
46 | LOGGER.info('ERROR: {}\nRe-attempting {} to {}...'.format(e, url2 or url, file))
47 | os.system("curl -L '{}' -o '{}' --retry 3 -C -".format(url2 or url, file)) # curl download, retry and resume on fail
48 | finally:
49 | if not file.exists() or file.stat().st_size < min_bytes: # check
50 | file.unlink(missing_ok=True) # remove partial downloads
51 | LOGGER.info("ERROR: {}\n{}".format(assert_msg, error_msg))
52 | LOGGER.info('')
53 |
54 |
55 | def attempt_download(file, repo='ultralytics/yolov5', release='v6.1'):
56 | # Attempt file download from GitHub release assets if not found locally. release = 'latest', 'v6.1', etc.
57 | from utils.general import LOGGER
58 |
59 | def github_assets(repository, version='latest'):
60 | # Return GitHub repo tag (i.e. 'v6.1') and assets (i.e. ['yolov5s.pt', 'yolov5m.pt', ...])
61 | if version != 'latest':
62 | version = 'tags/{}'.format(version) # i.e. tags/v6.1
63 | response = requests.get('https://api.github.com/repos/{}/releases/{}'.format(repository, version)).json() # github api
64 | return response['tag_name'], [x['name'] for x in response['assets']] # tag, assets
65 |
66 | file = Path(str(file).strip().replace("'", ''))
67 | if not file.exists():
68 | # URL specified
69 | name = Path(urllib.parse.unquote(str(file))).name # decode '%2F' to '/' etc.
70 | if str(file).startswith(('http:/', 'https:/')): # download
71 | url = str(file).replace(':/', '://') # Pathlib turns :// -> :/
72 | file = name.split('?')[0] # parse authentication https://url.com/file.txt?auth...
73 | if Path(file).is_file():
74 | LOGGER.info('Found {} locally at {}'.format(url, file)) # file already exists
75 | else:
76 | safe_download(file=file, url=url, min_bytes=1E5)
77 | return file
78 |
79 | # GitHub assets
80 | assets = [
81 | 'yolov5n.pt', 'yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt', 'yolov5n6.pt', 'yolov5s6.pt',
82 | 'yolov5m6.pt', 'yolov5l6.pt', 'yolov5x6.pt']
83 | try:
84 | tag, assets = github_assets(repo, release)
85 | except Exception:
86 | try:
87 | tag, assets = github_assets(repo) # latest release
88 | except Exception:
89 | try:
90 | tag = subprocess.check_output('git tag', shell=True, stderr=subprocess.STDOUT).decode().split()[-1]
91 | except Exception:
92 | tag = release
93 |
94 | file.parent.mkdir(parents=True, exist_ok=True) # make parent dir (if required)
95 | if name in assets:
96 | url3 = 'https://drive.google.com/drive/folders/1EFQTEUeXWSFww0luse2jB9M1QNZQGwNl' # backup gdrive mirror
97 | safe_download(
98 | file,
99 | url='https://github.com/{}/releases/download/{}/{}'.format(repo, tag, name),
100 | url2='https://storage.googleapis.com/{}/{}/{}'.format(repo, tag, name), # backup url (optional)
101 | min_bytes=1E5,
102 | error_msg='{} missing, try downloading from https://github.com/{}/releases/{} or {}'.format(file, repo, tag, url3))
103 |
104 | return str(file)
105 |
106 |
107 | def gdrive_download(id='16TiPfZj7htmTyhntwcZyEEAejOUxuT6m', file='tmp.zip'):
108 | # Downloads a file from Google Drive. from utils.downloads import *; gdrive_download()
109 | t = time.time()
110 | file = Path(file)
111 | cookie = Path('cookie') # gdrive cookie
112 | print('Downloading https://drive.google.com/uc?export=download&id={} as {}... '.format(id, file), end='')
113 | file.unlink(missing_ok=True) # remove existing file
114 | cookie.unlink(missing_ok=True) # remove existing cookie
115 |
116 | # Attempt file download
117 | out = "NUL" if platform.system() == "Windows" else "/dev/null"
118 | os.system('curl -c ./cookie -s -L "drive.google.com/uc?export=download&id={}" > {}'.format(id, out))
119 | if os.path.exists('cookie'): # large file
120 | s = 'curl -Lb ./cookie "drive.google.com/uc?export=download&confirm={}&id={}" -o {}'.format(get_token(), id, file)
121 | else: # small file
122 | s = 'curl -s -L -o {} "drive.google.com/uc?export=download&id={}"'.format(file, id)
123 | r = os.system(s) # execute, capture return
124 | cookie.unlink(missing_ok=True) # remove existing cookie
125 |
126 | # Error check
127 | if r != 0:
128 | file.unlink(missing_ok=True) # remove partial
129 | print('Download error ') # raise Exception('Download error')
130 | return r
131 |
132 | # Unzip if archive
133 | if file.suffix == '.zip':
134 | print('unzipping... ', end='')
135 | ZipFile(file).extractall(path=file.parent) # unzip
136 | file.unlink() # remove zip
137 |
138 | print('Done ({:.1f}s)'.format(time.time() - t))
139 | return r
140 |
141 |
142 | def get_token(cookie="./cookie"):
143 | with open(cookie) as f:
144 | for line in f:
145 | if "download" in line:
146 | return line.split()[-1]
147 | return ""
148 |
149 |
150 | # Google utils: https://cloud.google.com/storage/docs/reference/libraries ----------------------------------------------
151 | #
152 | #
153 | # def upload_blob(bucket_name, source_file_name, destination_blob_name):
154 | # # Uploads a file to a bucket
155 | # # https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
156 | #
157 | # storage_client = storage.Client()
158 | # bucket = storage_client.get_bucket(bucket_name)
159 | # blob = bucket.blob(destination_blob_name)
160 | #
161 | # blob.upload_from_filename(source_file_name)
162 | #
163 | # print('File {} uploaded to {}.'.format(
164 | # source_file_name,
165 | # destination_blob_name))
166 | #
167 | #
168 | # def download_blob(bucket_name, source_blob_name, destination_file_name):
169 | # # Uploads a blob from a bucket
170 | # storage_client = storage.Client()
171 | # bucket = storage_client.get_bucket(bucket_name)
172 | # blob = bucket.blob(source_blob_name)
173 | #
174 | # blob.download_to_filename(destination_file_name)
175 | #
176 | # print('Blob {} downloaded to {}.'.format(
177 | # source_blob_name,
178 | # destination_file_name))
179 |
--------------------------------------------------------------------------------
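attempt_download() is the entry point the rest of the repo uses: given a bare filename it resolves it against the release assets and falls back through the GitHub and storage.googleapis.com URLs via safe_download(). A usage sketch:

```python
from utils.downloads import attempt_download

# fetched from the ultralytics/yolov5 v6.1 release assets if not found locally
weights = attempt_download('yolov5s.pt')
print(weights)  # local path to the checkpoint
```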
/yolov5/utils/flask_rest_api/README.md:
--------------------------------------------------------------------------------
1 | # Flask REST API
2 |
3 | [REST](https://en.wikipedia.org/wiki/Representational_state_transfer) [API](https://en.wikipedia.org/wiki/API)s are
4 | commonly used to expose Machine Learning (ML) models to other services. This folder contains an example REST API
5 | created using Flask to expose the YOLOv5s model from [PyTorch Hub](https://pytorch.org/hub/ultralytics_yolov5/).
6 |
7 | ## Requirements
8 |
9 | [Flask](https://palletsprojects.com/p/flask/) is required. Install with:
10 |
11 | ```shell
12 | $ pip install Flask
13 | ```
14 |
15 | ## Run
16 |
17 | After Flask installation run:
18 |
19 | ```shell
20 | $ python3 restapi.py --port 5000
21 | ```
22 |
23 | Then use [curl](https://curl.se/) to perform a request:
24 |
25 | ```shell
26 | $ curl -X POST -F image=@zidane.jpg 'http://localhost:5000/v1/object-detection/yolov5s'
27 | ```
28 |
29 | The model inference results are returned as a JSON response:
30 |
31 | ```json
32 | [
33 | {
34 | "class": 0,
35 | "confidence": 0.8900438547,
36 | "height": 0.9318675399,
37 | "name": "person",
38 | "width": 0.3264600933,
39 | "xcenter": 0.7438579798,
40 | "ycenter": 0.5207948685
41 | },
42 | {
43 | "class": 0,
44 | "confidence": 0.8440024257,
45 | "height": 0.7155083418,
46 | "name": "person",
47 | "width": 0.6546785235,
48 | "xcenter": 0.427829951,
49 | "ycenter": 0.6334488392
50 | },
51 | {
52 | "class": 27,
53 | "confidence": 0.3771208823,
54 | "height": 0.3902671337,
55 | "name": "tie",
56 | "width": 0.0696444362,
57 | "xcenter": 0.3675483763,
58 | "ycenter": 0.7991207838
59 | },
60 | {
61 | "class": 27,
62 | "confidence": 0.3527112305,
63 | "height": 0.1540903747,
64 | "name": "tie",
65 | "width": 0.0336618312,
66 | "xcenter": 0.7814827561,
67 | "ycenter": 0.5065554976
68 | }
69 | ]
70 | ```
71 |
72 | An example Python script to perform inference using [requests](https://docs.python-requests.org/en/master/) is given
73 | in `example_request.py`.
74 |
--------------------------------------------------------------------------------
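The xcenter/ycenter/width/height fields in the sample response above are normalized to [0, 1], so converting a detection to pixel corner coordinates is one multiply per corner. A sketch, assuming that normalized layout and a hypothetical image size:

```python
def to_xyxy(det, img_w, img_h):
    # assumes the normalized xcenter/ycenter/width/height layout shown above
    x1 = (det['xcenter'] - det['width'] / 2) * img_w
    y1 = (det['ycenter'] - det['height'] / 2) * img_h
    x2 = (det['xcenter'] + det['width'] / 2) * img_w
    y2 = (det['ycenter'] + det['height'] / 2) * img_h
    return x1, y1, x2, y2

print(to_xyxy({'xcenter': 0.74, 'ycenter': 0.52, 'width': 0.33, 'height': 0.93}, 1280, 720))
```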
/yolov5/utils/flask_rest_api/example_request.py:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | """
3 | Perform test request
4 | """
5 |
6 | import pprint
7 |
8 | import requests
9 |
10 | DETECTION_URL = "http://localhost:5000/v1/object-detection/yolov5s"
11 | IMAGE = "zidane.jpg"
12 |
13 | # Read image
14 | with open(IMAGE, "rb") as f:
15 | image_data = f.read()
16 |
17 | response = requests.post(DETECTION_URL, files={"image": image_data}).json()
18 |
19 | pprint.pprint(response)
20 |
--------------------------------------------------------------------------------
/yolov5/utils/flask_rest_api/restapi.py:
--------------------------------------------------------------------------------
1 | # YOLOv5 🚀 by Ultralytics, GPL-3.0 license
2 | """
3 | Run a Flask REST API exposing a YOLOv5s model
4 | """
5 |
6 | import argparse
7 | import io
8 |
9 | import torch
10 | from flask import Flask, request
11 | from PIL import Image
12 |
13 | app = Flask(__name__)
14 |
15 | DETECTION_URL = "/v1/object-detection/yolov5s"
16 |
17 |
18 | @app.route(DETECTION_URL, methods=["POST"])
19 | def predict():
20 | if request.method != "POST":
21 | return
22 |
23 | if request.files.get("image"):
24 | # Method 1
25 | # with request.files["image"] as f:
26 | # im = Image.open(io.BytesIO(f.read()))
27 |
28 | # Method 2
29 | im_file = request.files["image"]
30 | im_bytes = im_file.read()
31 | im = Image.open(io.BytesIO(im_bytes))
32 |
33 | results = model(im, size=640) # reduce size=320 for faster inference
34 | return results.pandas().xyxy[0].to_json(orient="records")
35 |
36 |
37 | if __name__ == "__main__":
38 | parser = argparse.ArgumentParser(description="Flask API exposing YOLOv5 model")
39 | parser.add_argument("--port", default=5000, type=int, help="port number")
40 | opt = parser.parse_args()
41 |
42 | # Fix known issue urllib.error.HTTPError 403: rate limit exceeded https://github.com/ultralytics/yolov5/pull/7210
43 | torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
44 |
45 | model = torch.hub.load("ultralytics/yolov5", "yolov5s", force_reload=True) # force_reload to recache
46 | app.run(host="0.0.0.0", port=opt.port) # debug=True causes Restarting with stat
47 |
--------------------------------------------------------------------------------
/yolov5/utils/google_app_engine/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM gcr.io/google-appengine/python
2 |
3 | # Create a virtualenv for dependencies. This isolates these packages from
4 | # system-level packages.
5 | # Use -p python3 or -p python3.7 to select python version. Default is version 2.
6 | RUN virtualenv /env -p python3
7 |
8 | # Setting these environment variables is the same as running
9 | # source /env/bin/activate.
10 | ENV VIRTUAL_ENV /env
11 | ENV PATH /env/bin:$PATH
12 |
13 | RUN apt-get update && apt-get install -y python-opencv
14 |
15 | # Copy the application's requirements.txt and run pip to install all
16 | # dependencies into the virtualenv.
17 | ADD requirements.txt /app/requirements.txt
18 | RUN pip install -r /app/requirements.txt
19 |
20 | # Add the application source code.
21 | ADD . /app
22 |
23 | # Run a WSGI server to serve the application. gunicorn must be declared as
24 | # a dependency in requirements.txt.
25 | CMD gunicorn -b :$PORT main:app
26 |
--------------------------------------------------------------------------------
/yolov5/utils/google_app_engine/additional_requirements.txt:
--------------------------------------------------------------------------------
1 | # Add these requirements to your app on top of the existing ones
2 | pip==21.1
3 | Flask==1.0.2
4 | gunicorn==19.9.0
5 |
--------------------------------------------------------------------------------
/yolov5/utils/google_app_engine/app.yaml:
--------------------------------------------------------------------------------
1 | runtime: custom
2 | env: flex
3 |
4 | service: yolov5app
5 |
6 | liveness_check:
7 | initial_delay_sec: 600
8 |
9 | manual_scaling:
10 | instances: 1
11 | resources:
12 | cpu: 1
13 | memory_gb: 4
14 | disk_size_gb: 20
15 |
--------------------------------------------------------------------------------
/yolov5/utils/loggers/wandb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tangjielong928/Multi-modal-Event-Extraction/4e133e609f3ada97862dd65123652f569761de5d/yolov5/utils/loggers/wandb/__init__.py
--------------------------------------------------------------------------------
/yolov5/utils/loggers/wandb/log_dataset.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from wandb_utils import WandbLogger
4 |
5 | from utils.general import LOGGER
6 |
7 | WANDB_ARTIFACT_PREFIX = 'wandb-artifact://'
8 |
9 |
10 | def create_dataset_artifact(opt):
11 | logger = WandbLogger(opt, None, job_type='Dataset Creation') # TODO: return value unused
12 | if not logger.wandb:
13 | LOGGER.info("install wandb using `pip install wandb` to log the dataset")
14 |
15 |
16 | if __name__ == '__main__':
17 | parser = argparse.ArgumentParser()
18 | parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path')
19 | parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
20 | parser.add_argument('--project', type=str, default='YOLOv5', help='name of W&B Project')
21 | parser.add_argument('--entity', default=None, help='W&B entity')
22 | parser.add_argument('--name', type=str, default='log dataset', help='name of W&B run')
23 |
24 | opt = parser.parse_args()
25 | opt.resume = False # Explicitly disallow resume check for dataset upload job
26 |
27 | create_dataset_artifact(opt)
28 |
--------------------------------------------------------------------------------
/yolov5/utils/loggers/wandb/sweep.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from pathlib import Path
3 |
4 | import wandb
5 |
6 | FILE = Path(__file__).resolve()
7 | ROOT = FILE.parents[3] # YOLOv5 root directory
8 | if str(ROOT) not in sys.path:
9 | sys.path.append(str(ROOT)) # add ROOT to PATH
10 |
11 | from train import parse_opt, train
12 | from utils.callbacks import Callbacks
13 | from utils.general import increment_path
14 | from utils.torch_utils import select_device
15 |
16 |
17 | def sweep():
18 | wandb.init()
19 |     # Get hyp dict from sweep agent. Copy because train() modifies parameters, which confuses wandb.
20 | hyp_dict = vars(wandb.config).get("_items").copy()
21 |
22 | # Workaround: get necessary opt args
23 | opt = parse_opt(known=True)
24 | opt.batch_size = hyp_dict.get("batch_size")
25 | opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok or opt.evolve))
26 | opt.epochs = hyp_dict.get("epochs")
27 | opt.nosave = True
28 | opt.data = hyp_dict.get("data")
29 | opt.weights = str(opt.weights)
30 | opt.cfg = str(opt.cfg)
31 | opt.data = str(opt.data)
32 | opt.hyp = str(opt.hyp)
33 | opt.project = str(opt.project)
34 | device = select_device(opt.device, batch_size=opt.batch_size)
35 |
36 | # train
37 | train(hyp_dict, opt, device, callbacks=Callbacks())
38 |
39 |
40 | if __name__ == "__main__":
41 | sweep()
42 |
--------------------------------------------------------------------------------
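sweep.py is the per-trial entry point; a sweep is created from sweep.yaml (next section) and then executed by an agent. A hedged sketch of driving that from Python with the wandb API; the project name is an assumption, and without a function argument the agent runs the `program` declared in the config:

```python
import yaml
import wandb

with open('utils/loggers/wandb/sweep.yaml') as f:
    sweep_config = yaml.safe_load(f)

sweep_id = wandb.sweep(sweep=sweep_config, project='YOLOv5')  # project name is an assumption
wandb.agent(sweep_id, count=5)  # run 5 trials of the configured program
```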
/yolov5/utils/loggers/wandb/sweep.yaml:
--------------------------------------------------------------------------------
1 | # Hyperparameters for training
2 | # To set a range,
3 | # provide min and max values as:
4 | # parameter:
5 | #
6 | # min: scalar
7 | # max: scalar
8 | # OR
9 | #
10 | # set a specific list of search-space values:
11 | # parameter:
12 | # values: [scalar1, scalar2, scalar3...]
13 | #
14 | # You can use the grid, Bayesian, and hyperopt search strategies.
15 | # For more info on configuring sweeps, visit https://docs.wandb.ai/guides/sweeps/configuration
16 |
17 | program: utils/loggers/wandb/sweep.py
18 | method: random
19 | metric:
20 | name: metrics/mAP_0.5
21 | goal: maximize
22 |
23 | parameters:
24 | # hyperparameters: set either min, max range or values list
25 | data:
26 | value: "data/coco128.yaml"
27 | batch_size:
28 | values: [64]
29 | epochs:
30 | values: [10]
31 |
32 | lr0:
33 | distribution: uniform
34 | min: 1e-5
35 | max: 1e-1
36 | lrf:
37 | distribution: uniform
38 | min: 0.01
39 | max: 1.0
40 | momentum:
41 | distribution: uniform
42 | min: 0.6
43 | max: 0.98
44 | weight_decay:
45 | distribution: uniform
46 | min: 0.0
47 | max: 0.001
48 | warmup_epochs:
49 | distribution: uniform
50 | min: 0.0
51 | max: 5.0
52 | warmup_momentum:
53 | distribution: uniform
54 | min: 0.0
55 | max: 0.95
56 | warmup_bias_lr:
57 | distribution: uniform
58 | min: 0.0
59 | max: 0.2
60 | box:
61 | distribution: uniform
62 | min: 0.02
63 | max: 0.2
64 | cls:
65 | distribution: uniform
66 | min: 0.2
67 | max: 4.0
68 | cls_pw:
69 | distribution: uniform
70 | min: 0.5
71 | max: 2.0
72 | obj:
73 | distribution: uniform
74 | min: 0.2
75 | max: 4.0
76 | obj_pw:
77 | distribution: uniform
78 | min: 0.5
79 | max: 2.0
80 | iou_t:
81 | distribution: uniform
82 | min: 0.1
83 | max: 0.7
84 | anchor_t:
85 | distribution: uniform
86 | min: 2.0
87 | max: 8.0
88 | fl_gamma:
89 | distribution: uniform
90 | min: 0.0
91 | max: 4.0
92 | hsv_h:
93 | distribution: uniform
94 | min: 0.0
95 | max: 0.1
96 | hsv_s:
97 | distribution: uniform
98 | min: 0.0
99 | max: 0.9
100 | hsv_v:
101 | distribution: uniform
102 | min: 0.0
103 | max: 0.9
104 | degrees:
105 | distribution: uniform
106 | min: 0.0
107 | max: 45.0
108 | translate:
109 | distribution: uniform
110 | min: 0.0
111 | max: 0.9
112 | scale:
113 | distribution: uniform
114 | min: 0.0
115 | max: 0.9
116 | shear:
117 | distribution: uniform
118 | min: 0.0
119 | max: 10.0
120 | perspective:
121 | distribution: uniform
122 | min: 0.0
123 | max: 0.001
124 | flipud:
125 | distribution: uniform
126 | min: 0.0
127 | max: 1.0
128 | fliplr:
129 | distribution: uniform
130 | min: 0.0
131 | max: 1.0
132 | mosaic:
133 | distribution: uniform
134 | min: 0.0
135 | max: 1.0
136 | mixup:
137 | distribution: uniform
138 | min: 0.0
139 | max: 1.0
140 | copy_paste:
141 | distribution: uniform
142 | min: 0.0
143 | max: 1.0
144 |
--------------------------------------------------------------------------------