├── models
│   ├── __init__.py
│   ├── model_builder.py
│   ├── encoders.py
│   └── decoders.py
├── dataloaders
│   ├── __init__.py
│   └── data_loader.py
├── helper_scripts
│   ├── countLinesCONLL.py
│   ├── CombineAnnotatedFiles.py
│   ├── removeAnnotatedSents.py
│   ├── pickKTokens.py
│   ├── pickKTokensRev.py
│   └── SimulateAnnotations.py
├── utils
│   ├── features.py
│   └── util.py
├── commands
│   ├── SAL_CT.sh
│   ├── ETAL_FULL_CRF_CT.sh
│   ├── ETAL_PARTIAL_CRF_CT.sh
│   └── CFEAL_PARTIAL_CRF_CT.sh
├── README.md
├── args.py
├── main.py
└── eval
    └── conlleval.v2
/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dataloaders/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/helper_scripts/countLinesCONLL.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import argparse
3 |
4 | arg_parser = argparse.ArgumentParser()
5 |
6 | arg_parser.add_argument("--input", help="CoNLL file whose sentences are to be counted",
7 | default=None,
8 | type=str)
9 |
10 | args = arg_parser.parse_args()
11 | print("Args used for this run:")
12 | print(args)
13 |
14 | with codecs.open(args.input,"r",encoding='utf-8') as fin:
15 | index = 0
16 | one_line = []
17 | for line in fin:
18 | if line == "" or line == "\n":
19 | if len(one_line) > 0:
20 | index +=1
21 | one_line = []
22 | else:
23 | line = line.strip()
24 | one_line.append(line)
25 |
26 | if len(one_line)>0:
27 | index = index + 1
28 | print(index)
29 |
30 |
--------------------------------------------------------------------------------
/helper_scripts/CombineAnnotatedFiles.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import argparse
3 |
4 |
5 | arg_parser = argparse.ArgumentParser()
6 |
7 | arg_parser.add_argument("--files", help="CoNLL files to concatenate, in order",
8 | default=None,nargs='+')
9 |
10 | #arg_parser.add_argument("--file2", help="File 2",
11 | # default=None,
12 | # type=str)
13 |
14 | arg_parser.add_argument("--output", help="Output File",
15 | default=None,
16 | type=str)
17 |
18 | args = arg_parser.parse_args()
19 | print("Args used for this run:")
20 | print(args)
21 |
22 |
23 | files = args.files
24 | fout = codecs.open(args.output, "w", encoding='utf-8')
25 |
26 | for i in files:
27 | with codecs.open(i,"r", encoding='utf-8') as fin:
28 | for line in fin:
29 | fout.write(line)
30 |         print("Done reading file: " + str(i))
31 | fout.write("\n")
32 |
--------------------------------------------------------------------------------
/utils/features.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import numpy as np
3 | import pdb
4 |
5 |
6 | def get_feature_sent(lang, sent, args, cap_ratio_dict, type=None):
7 | dsf = []
8 | individual_feats = []
9 |
10 | if args.cap and not args.use_discrete_features:
11 | cap_feat = [w[0].isupper() for w in sent]
12 | individual_feats.append(cap_feat)
13 |
14 | if args.cap_ratio_path is not None:
15 | cap_feats = []
16 | for w in sent:
17 | # feat = np.zeros(4,)
18 | feat = [0, 0, 0, 0]
19 | if w in cap_ratio_dict:
20 | feat[cap_ratio_dict[w]] = 1
21 | cap_feats.append(feat)
22 | individual_feats.append(cap_feats)
23 |
24 | # individual_feats = zip(*individual_feats) # [(), ()]
25 | if len(dsf) > 0 and len(individual_feats) > 0:
26 | # individual_feats = [list(i) for i in individual_feats]
27 | dsf = [list(i) for i in dsf]
28 | # for i, d in zip(individual_feats, dsf):
29 | # print i, d
30 | # print len(i), len(d)
31 | new_feat = [list(tuple(i + d)) for i, d in zip(individual_feats[0], dsf)]
32 | # pdb.set_trace()
33 | return new_feat
34 | elif len(individual_feats) > 0:
35 | return individual_feats
36 | elif len(dsf) > 0:
37 | return dsf
38 | else:
39 | return []
40 |
41 |
42 | def get_brown_cluster(path):
43 | bc_dict = dict()
44 | linear_map = dict()
45 | with codecs.open(path, "r", "utf-8") as fin:
46 | for line in fin:
47 | fields = line.strip().split('\t')
48 | if len(fields) == 3:
49 | word = fields[1]
50 | binary_string = fields[0]
51 | bid = int(binary_string, 2)
52 | if bid not in linear_map:
53 | linear_map[bid] = len(linear_map)
54 | bc_dict[word] = linear_map[bid]
55 | return bc_dict
56 |
--------------------------------------------------------------------------------
/helper_scripts/removeAnnotatedSents.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import codecs
3 |
4 | def selectUnAnnotated(args):
5 | annotated_sents = set()
6 | with codecs.open(args.annotated, "r",encoding='utf-8') as fin:
7 | sent = []
8 | count = 0
9 | for line in fin:
10 | line = line.strip()
11 | if line == "" or line == "\n":
12 | annotated_sents.add(" ".join(sent))
13 | count +=1
14 | sent =[]
15 | else:
16 | tokens = line.split("\t")
17 | sent.append(tokens[0])
18 |
19 | print(count, len(annotated_sents))
20 | fout = codecs.open("./annotated_sents.txt","w", encoding='utf-8')
21 | for sent in annotated_sents:
22 | fout.write(sent + "\n")
23 |
24 | ffull = codecs.open("./orig_sents.txt","w", encoding='utf-8')
25 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout:
26 | sent = []
27 | tokens = []
28 | for line in fin:
29 | line = line.strip()
30 | if line == "" or line == "\n":
31 | sentence = " ".join(tokens)
32 | ffull.write(sentence + "\n")
33 | tokens = []
34 | if sentence not in annotated_sents:
35 |                 # print(sentence)
36 | for l in sent:
37 | fout.write(l + "\n")
38 | fout.write("\n")
39 | sent = []
40 | else:
41 | sent.append(line)
42 | tokens.append(line.split("\t")[0])
43 |
44 |
45 | if __name__ == "__main__":
46 | parser = argparse.ArgumentParser()
47 | parser.add_argument("--input",type=str)
48 | parser.add_argument("--annotated", type=str)
49 | parser.add_argument("--output", type=str)
50 | args = parser.parse_args()
51 | print(args)
52 | selectUnAnnotated(args)
53 |
--------------------------------------------------------------------------------
/helper_scripts/pickKTokens.py:
--------------------------------------------------------------------------------
1 | import codecs, argparse
2 |
3 |
4 | def pickKTokens(args):
5 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout:
6 | count = args.k
7 | one_sent = []
8 | for line in fin:
9 | if line == "" or line == "\n":
10 | for s in one_sent:
11 | fout.write(s + "\n")
12 | fout.write('\n')
13 | one_sent = []
14 | if count <=0:
15 | break
16 |
17 | else:
18 | tokens = line.strip().split("\t")
19 | tag = tokens[1]
20 | token = tokens[0]
21 | if "UNK" in tag:
22 | count -= 1
23 |
24 | one_sent.append(line.strip())
25 |
26 |
27 | if len(one_sent) > 0:
28 | for s in one_sent:
29 | fout.write(s + "\n")
30 | fout.write('\n')
31 |
32 | def pickKTokensRev(args):
33 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout:
34 | count = args.k
35 | one_sent = []
36 | for line in fin:
37 | if line == "" or line == "\n":
38 | for s in one_sent:
39 | fout.write(s + "\n")
40 | fout.write('\n')
41 | one_sent = []
42 | if count <=0:
43 | break
44 |
45 | else:
46 | tokens = line.strip().split("\t")
47 | tag = tokens[1]
48 | token = tokens[0]
49 | count -= 1
50 |
51 | one_sent.append(line.strip())
52 |
53 |
54 | if len(one_sent) > 0:
55 | for s in one_sent:
56 | fout.write(s + "\n")
57 | fout.write('\n')
58 |
59 |
60 |
61 | if __name__ == "__main__":
62 | parser = argparse.ArgumentParser()
63 | parser.add_argument("--input", type=str)
64 | parser.add_argument("--k", type=int)
65 | parser.add_argument("--output",type=str)
66 | args = parser.parse_args()
67 |
68 | pickKTokens(args)
69 | #pickKTokensRev(args)
70 |
--------------------------------------------------------------------------------
/helper_scripts/pickKTokensRev.py:
--------------------------------------------------------------------------------
1 | import codecs, argparse
2 |
3 |
4 | def pickKTokens(args):
5 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout:
6 | count = args.k
7 | one_sent = []
8 | for line in fin:
9 | if line == "" or line == "\n":
10 | for s in one_sent:
11 | fout.write(s + "\n")
12 | fout.write('\n')
13 | one_sent = []
14 | if count <=0:
15 | break
16 |
17 | else:
18 | tokens = line.strip().split("\t")
19 | tag = tokens[1]
20 | token = tokens[0]
21 | if "UNK" in tag:
22 | count -= 1
23 |
24 | one_sent.append(line.strip())
25 |
26 |
27 | if len(one_sent) > 0:
28 | for s in one_sent:
29 | fout.write(s + "\n")
30 | fout.write('\n')
31 |
32 | def pickKTokensRev(args):
33 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout:
34 | count = args.k
35 | one_sent = []
36 | for line in fin:
37 | if line == "" or line == "\n":
38 | for s in one_sent:
39 | fout.write(s + "\n")
40 | fout.write('\n')
41 | one_sent = []
42 | if count <=0:
43 | break
44 |
45 | else:
46 | tokens = line.strip().split("\t")
47 | tag = tokens[1]
48 | token = tokens[0]
49 | count -= 1
50 |
51 | one_sent.append(line.strip())
52 |
53 |
54 | if len(one_sent) > 0:
55 | for s in one_sent:
56 | fout.write(s + "\n")
57 | fout.write('\n')
58 |
59 |
60 |
61 | if __name__ == "__main__":
62 | parser = argparse.ArgumentParser()
63 | parser.add_argument("--input", type=str)
64 | parser.add_argument("--k", type=int)
65 | parser.add_argument("--output",type=str)
66 | args = parser.parse_args()
67 |
68 | #pickKTokens(args)
69 | pickKTokensRev(args)
70 |
--------------------------------------------------------------------------------
/commands/SAL_CT.sh:
--------------------------------------------------------------------------------
1 | DIR="../data/Spanish/SAL_CT"
2 | DATA="../data/Spanish"
3 |
4 | for i in {1..20} ; do
5 | python2 ../helper_scripts/pickKTokensRev.py --input $DIR/to_annotate_v${i}.1_LC.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll
6 |
7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll
8 |
9 | PREV=`expr $i - 1`
10 |
11 |     python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR/unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll
12 |
13 | if [ "$i" -gt 1 ]
14 | then
15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll $DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll
16 | else
17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll
18 | fi
19 |
20 | #Train the NER Model Using FineTune
21 | MODEL_NAME="200_SAL_CT_spa_${i}.1_finetune"
22 | python -u ../main.py \
23 | --dynet-seed 3278657 \
24 | --word_emb_dim 100 \
25 | --batch_size 10 \
26 | --model_name ${MODEL_NAME} \
27 | --lang es \
28 | --fixedVocab \
29 | --fineTune \
30 | --test_conll \
31 | --tot_epochs 1000 \
32 | --aug_lang_train_path $DATA/vocab.conll \
33 | --misc \
34 | --init_lr 0.015 \
35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \
36 | --valid_freq 1300 \
37 | --pretrain_emb_path $DATA/esp.vec \
38 | --dev_path $DATA/esp.dev \
39 | --test_path $DATA/esp.test \
40 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log
41 |
42 |
43 | #Run the Active Learning Session
44 | NEW=`expr $i + 1`
45 |
46 | MODEL_NAME="200_SAL_spa_${i}.1_finetune_activelearning"
47 | python -u ../main.py \
48 | --dynet-seed 3278657 \
49 | --mode test_1 \
50 | --fixedVocab \
51 | --aug_lang_train_path $DATA/vocab.conll \
52 | --misc \
53 | --word_emb_dim 100 \
54 | --model_name ${MODEL_NAME} \
55 | --lang es \
56 | --load_from_path ../saved_models/200_SAL_CT_spa_${i}.1_finetune.model \
57 | --pretrain_emb_path $DATA/esp.vec \
58 | --dev_path $DATA/esp.dev \
59 | --test_path $DIR/unlabel_v${i}.1.conll \
60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \
61 | --test_conll \
62 | --k 200 \
63 | --SPAN_wise \
64 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log
65 |
66 | done
67 |
--------------------------------------------------------------------------------
/commands/ETAL_FULL_CRF_CT.sh:
--------------------------------------------------------------------------------
1 | DIR="../data/Spanish/ETAL_FULL_CRF_CT"
2 | DATA="../data/Spanish"
3 |
4 | for i in {1..20} ; do
5 | python2 ../helper_scripts/pickKTokens.py --input $DIR/to_annotate_v${i}.1.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll
6 |
7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll
8 |
9 | PREV=`expr $i - 1`
10 |
11 |     python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR/unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll
12 |
13 | if [ "$i" -gt 1 ]
14 | then
15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll $DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll
16 | else
17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll
18 | fi
19 |
20 | #Train the NER Model Using FineTune
21 | MODEL_NAME="200_Entropy_Full_CT_spa_${i}.1_finetune"
22 | python -u ../main.py \
23 | --dynet-seed 3278657 \
24 | --word_emb_dim 100 \
25 | --batch_size 10 \
26 | --model_name ${MODEL_NAME} \
27 | --lang es \
28 | --fixedVocab \
29 | --fineTune \
30 | --test_conll \
31 | --tot_epochs 1000 \
32 | --misc \
33 | --aug_lang_train_path $DATA/vocab.conll \
34 | --init_lr 0.015 \
35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \
36 | --valid_freq 1300 \
37 | --pretrain_emb_path $DATA/esp.vec \
38 | --dev_path $DATA/esp.dev \
39 | --test_path $DATA/esp.test \
40 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log
41 |
42 |
43 | #Run the Active Learning Session
44 | NEW=`expr $i + 1`
45 |
46 | MODEL_NAME="200_Entropy_Full_CT_spa_${i}.1_finetune_activelearning"
47 | python -u ../main.py \
48 | --dynet-seed 3278657 \
49 | --mode test_1 \
50 | --fixedVocab \
51 | --aug_lang_train_path $DATA/vocab.conll \
52 | --word_emb_dim 100 \
53 | --model_name ${MODEL_NAME} \
54 | --lang es \
55 | --misc \
56 | --load_from_path ../saved_models/200_Entropy_Full_CT_spa_${i}.1_finetune.model \
57 | --pretrain_emb_path $DATA/esp.vec \
58 | --dev_path $DATA/esp.dev \
59 | --test_path $DIR/unlabel_v${i}.1.conll \
60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \
61 | --ngram 5 \
62 | --test_conll \
63 | --entropy_threshold 1e-8 \
64 | --k 200 \
65 | --SPAN_wise \
66 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log
67 |
68 | done
69 |
--------------------------------------------------------------------------------
/commands/ETAL_PARTIAL_CRF_CT.sh:
--------------------------------------------------------------------------------
1 | DIR="../data/Spanish/ETAL_PARTIAL_CRF_CT"
2 | DATA="../data/Spanish"
3 |
4 | for i in {1..20} ; do
5 | python2 ../helper_scripts/pickKTokens.py --input $DIR/to_annotate_v${i}.1.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll
6 |
7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll --needUNK
8 |
9 | PREV=`expr $i - 1`
10 |
11 |     python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR/unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll
12 |
13 | if [ "$i" -gt 1 ]
14 | then
15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll $DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll
16 | else
17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll
18 | fi
19 |
20 | #Train the NER Model Using FineTune
21 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune"
22 | python -u ../main.py \
23 | --dynet-seed 3278657 \
24 | --word_emb_dim 100 \
25 | --batch_size 10 \
26 | --model_name ${MODEL_NAME} \
27 | --lang es \
28 | --fixedVocab \
29 | --fineTune \
30 | --test_conll \
31 | --misc \
32 | --tot_epochs 1000 \
33 | --aug_lang_train_path $DATA/vocab.conll \
34 | --init_lr 0.015 \
35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \
36 | --valid_freq 1300 \
37 | --pretrain_emb_path $DATA/esp.vec \
38 | --use_partial \
39 | --dev_path $DATA/esp.dev \
40 | --test_path $DATA/esp.test \
41 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log
42 |
43 |
44 | #Run the Active Learning Session
45 | NEW=`expr $i + 1`
46 |
47 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune_activelearning"
48 | python -u ../main.py \
49 | --dynet-seed 3278657 \
50 | --mode test_1 \
51 | --fixedVocab \
52 | --aug_lang_train_path $DATA/vocab.conll \
53 | --word_emb_dim 100 \
54 | --model_name ${MODEL_NAME} \
55 | --lang es \
56 | --load_from_path ../saved_models/200_Entropy_Partial_CT_spa_${i}.1_finetune.model \
57 | --pretrain_emb_path $DATA/esp.vec \
58 | --dev_path $DATA/esp.dev \
59 | --test_path $DIR/unlabel_v${i}.1.conll \
60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \
61 | --ngram 5 \
62 | --misc \
63 | --test_conll \
64 | --entropy_threshold 1e-8 \
65 | --use_partial \
66 | --k 200 \
67 | --SPAN_wise \
68 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log
69 |
70 | done
71 |
--------------------------------------------------------------------------------
/commands/CFEAL_PARTIAL_CRF_CT.sh:
--------------------------------------------------------------------------------
1 | DIR="../data/Spanish/CFEAL_PARTIAL_CRF_CT"
2 | DATA="../data/Spanish"
3 |
4 | for i in {1..20} ; do
5 | python2 ../helper_scripts/pickKTokens.py --input $DIR/to_annotate_v${i}.1.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll
6 |
7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll --needUNK
8 |
9 | PREV=`expr $i - 1`
10 |
11 |     python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR/unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll
12 |
13 | if [ "$i" -gt 1 ]
14 | then
15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll $DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll
16 | else
17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll
18 | fi
19 |
20 | #Train the NER Model Using FineTune
21 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune"
22 | python -u ../main.py \
23 | --dynet-seed 3278657 \
24 | --word_emb_dim 100 \
25 | --batch_size 10 \
26 | --model_name ${MODEL_NAME} \
27 | --lang es \
28 | --fixedVocab \
29 | --fineTune \
30 | --test_conll \
31 | --tot_epochs 1000 \
32 | --aug_lang_train_path $DATA/vocab.conll \
33 | --misc \
34 | --init_lr 0.015 \
35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \
36 | --valid_freq 1300 \
37 | --pretrain_emb_path $DATA/esp.vec \
38 | --use_partial \
39 | --dev_path $DATA/esp.dev \
40 | --test_path $DATA/esp.test \
41 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log
42 |
43 |
44 | #Run the Active Learning Session
45 | NEW=`expr $i + 1`
46 |
47 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune_activelearning"
48 | python -u ../main.py \
49 | --dynet-seed 3278657 \
50 | --mode test_1 \
51 | --fixedVocab \
52 | --aug_lang_train_path $DATA/vocab.conll \
53 | --word_emb_dim 100 \
54 | --model_name ${MODEL_NAME} \
55 | --lang es \
56 | --load_from_path ../saved_models/200_Entropy_Partial_CT_spa_${i}.1_finetune.model \
57 | --pretrain_emb_path $DATA/esp.vec \
58 | --dev_path $DATA/esp.dev \
59 | --test_path $DIR/unlabel_v${i}.1.conll \
60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \
61 | --misc \
62 | --ngram 5 \
63 | --test_conll \
64 | --entropy_threshold 0 \
65 | --use_partial \
66 | --k 200 \
67 | --use_CFB \
68 | --SPAN_wise \
69 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log
70 |
71 | done
72 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Active Learning for Entity Recognition
2 |
3 | ### Requirements
4 | * Python 2.7
5 | * DyNet at commit 284838815ece9297a7100cc43035e1ea1b133a5
6 |
7 |
8 | ### Data
9 | In ```data/```, create a directory per language, as shown for ```data/Spanish``` (see the expected layout sketched below). Download the CoNLL train/dev/test NER datasets for that language into this directory. To acquire LDC datasets, please obtain the required access.
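
Based on the paths referenced in the scripts under ```commands/``` and in the commands shown later in this README, a language directory is expected to look roughly like the following (the Spanish file names are used as an illustration):

```
data/Spanish/
├── esp.dev         # CoNLL dev set (--dev_path)
├── esp.test        # CoNLL test set (--test_path)
├── esp.vec         # 100-d pretrained embeddings (--pretrain_emb_path, see Embeddings below)
├── vocab.conll     # dummy-labeled unlabeled text (--aug_lang_train_path)
└── SAL_CT/         # one working directory per active-learning strategy (also ETAL_FULL_CRF_CT/, ...)
```

Judging from the loop in the scripts, each strategy directory starts from an initial ```to_annotate_v1.1*.conll``` and ```unlabel_v0.1.conll``` file; the files for later iterations are written there by the scripts.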
10 |
11 |
12 | To store the trained models, create a ```saved_models``` directory in the parent folder.
13 | ### Embeddings
14 | Combine monolingual data acquired from Wikipedia with the plain text extracted from the labeled data, then train 100-d [GloVe](https://nlp.stanford.edu/projects/glove/) embeddings on the combined corpus (a minimal extraction sketch is shown below).
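
A minimal sketch, not part of the repository, for extracting the plain text from a CoNLL-formatted file before concatenating it with the Wikipedia text (file names here are placeholders):

```python
# conll_to_text.py -- illustrative sketch; paths are placeholders
import codecs

def conll_to_text(conll_path, text_path):
    """Write one sentence per line, keeping only the token column of a CoNLL file."""
    with codecs.open(conll_path, "r", encoding="utf-8") as fin, \
         codecs.open(text_path, "w", encoding="utf-8") as fout:
        tokens = []
        for line in fin:
            line = line.strip()
            if line == "":  # blank line marks a sentence boundary
                if tokens:
                    fout.write(" ".join(tokens) + "\n")
                tokens = []
            else:
                tokens.append(line.split("\t")[0])
        if tokens:  # flush the last sentence if the file lacks a trailing blank line
            fout.write(" ".join(tokens) + "\n")

if __name__ == "__main__":
    conll_to_text("esp.train", "esp.train.txt")
```

The extracted text can then be concatenated with the Wikipedia dump and fed to the standard GloVe training pipeline; save the resulting 100-d vectors as, e.g., ```esp.vec``` so that ```--pretrain_emb_path``` in the scripts resolves.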
15 |
16 | ### Active Learning Simulation
17 | The best NER performance was obtained with the fine-tuning training scheme. The scripts below run simulated active learning for the different active learning strategies:
18 | ``` cd commands ```
19 | * ETAL + Partial-CRF + CT (Proposed recipe): ``` ./ETAL_PARTIAL_CRF_CT.sh ```
20 | * ETAL + Full-CRF + CT: ``` ./ETAL_FULL_CRF_CT.sh ```
21 | * CFEAL + Partial-CRF + CT: ``` ./CFEAL_PARTIAL_CRF_CT.sh ```
22 | * SAL + CT: ``` ./SAL_CT.sh ```
23 |
24 | Things to note:
25 |
26 | We load the vocabulary from the path given by ```--aug_lang_train_path```. Therefore, create a CoNLL-formatted file with dummy labels from the unlabeled text (a minimal conversion sketch follows the format example below).
27 | For our experiments, we concatenated the transferred data with the unlabeled data (which was the entire training dataset) into a single CoNLL-formatted file.
28 | The CoNLL format is a tab-separated two-column format, as shown below:
29 |
30 | ```El O```
31 | ```grupo O```
32 |
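A minimal sketch, not part of the repository, for producing such a dummy-labeled file from raw, whitespace-tokenized text (file names are placeholders):

```python
# make_dummy_conll.py -- illustrative sketch; paths are placeholders
import codecs

def text_to_dummy_conll(text_path, conll_path, dummy_tag="O"):
    """Write each token and the dummy tag on one tab-separated line, with a blank line between sentences."""
    with codecs.open(text_path, "r", encoding="utf-8") as fin, \
         codecs.open(conll_path, "w", encoding="utf-8") as fout:
        for sentence in fin:
            tokens = sentence.strip().split()
            if not tokens:
                continue
            for token in tokens:
                fout.write(token + "\t" + dummy_tag + "\n")
            fout.write("\n")

if __name__ == "__main__":
    text_to_dummy_conll("unlabeled.es.txt", "dummy_unlabeled.conll")
```

The dummy-labeled file can then be concatenated with the transferred data using the provided helper, e.g. ```python2 helper_scripts/CombineAnnotatedFiles.py --files transferred_data.conll dummy_unlabeled.conll --output vocab.conll```, and the result passed as ```--aug_lang_train_path```.
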
33 | The LDC NER label set differs from the CoNLL label set by one tag. Therefore, add ``` --misc ``` to the argument set when running any experiments on CoNLL data. The label set is hard-coded in the ```dataloaders/data_loader.py``` file.
34 |
35 | ### Cross-Lingual Transferred Data
36 | We used the model proposed by Xie et al. (2018) to obtain the cross-lingually transferred data from English.
37 | Please refer to their code [here](https://github.com/thespectrewithin/cross-lingual_NER).
38 |
39 | For the Fine-Tune training scheme, train a base NER model on the transferred data as follows (here ```$DATA``` points to the language data directory, e.g. ```../data/Spanish```, and ```$DIR``` to the directory containing ```transferred_data.conll```):
40 |
41 | MODEL_NAME="spanish_full_transfer_baseline"
42 | python -u ../main.py \
43 | --dynet-seed 3278657 \
44 | --word_emb_dim 100 \
45 | --batch_size 10 \
46 | --model_name ${MODEL_NAME} \
47 | --lang es \
48 | --fixedVocab \
49 | --test_conll \
50 | --tot_epochs 1000 \
51 | --aug_lang_train_path $DATA/vocab.conll \
52 | --init_lr 0.015 \
53 | --valid_freq 1300 \
54 | --misc \
55 | --pretrain_emb_path $DATA/esp.vec \
56 | --dev_path $DATA/esp.dev \
57 | --test_path $DATA/esp.test \
58 | --train_path $DIR/transferred_data.conll 2>&1 | tee ${MODEL_NAME}.log
59 |
60 | ### References
61 | If you make use of this software for research purposes, we would appreciate your citing the following:
62 | ```
63 | @inproceedings{chaudhary19emnlp,
64 | title = {A Little Annotation does a Lot of Good: A Study in Bootstrapping Low-resource Named Entity Recognizers},
65 | author = {Aditi Chaudhary and Jiateng Xie and Zaid Sheikh and Graham Neubig and Jaime Carbonell},
66 | booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
67 | address = {Hong Kong},
68 | month = {November},
69 | url = {http://arxiv.org/abs/1908.08983},
70 | year = {2019}
71 | }
72 | ```
73 |
74 | ### Contact
75 | For any issues, please feel free to reach out to `aschaudh@andrew.cmu.edu`.
76 |
--------------------------------------------------------------------------------
/helper_scripts/SimulateAnnotations.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import argparse
3 | from copy import deepcopy
4 |
5 |
6 |
7 |
8 |
9 |
10 | def annotate(input, output):
11 | gold_lines = []
12 |
13 | with codecs.open(input, "r", encoding='utf-8') as fin,codecs.open(output, "w", encoding='utf-8') as fout:
14 | actual_line = []
15 | actual_one_line = []
16 |
17 | crf_line= []
18 | crf_one_line = []
19 |
20 | gold_one_line = []
21 | prev = ""
22 | for line in fin:
23 | if line == "" or line == "\n":
24 | #fout.write("\n")
25 | actual_line.append(actual_one_line)
26 | actual_one_line = []
27 |
28 | crf_line.append(crf_one_line)
29 | crf_one_line = []
30 |
31 | gold_lines.append(gold_one_line)
32 | gold_one_line = []
33 |
34 | prev = ""
35 | else:
36 | tokens = line.strip().split()
37 | gold_one_line.append(tokens[-1])
38 |
39 | if "UNK" in tokens[1]: #Find the true start of the entity
40 | #fout.write(tokens[0] + "\t" + tokens[-1] + '\n')
41 | actual_one_line.append(tokens[0] + "\t" + tokens[-1])
42 | prev = tokens[-1]
43 |
44 |
45 | else:
46 | #fout.write(tokens[0] + "\t" + tokens[1] + '\n')
47 | # actual_one_line.append(tokens[0] + "\t" + tokens[1])
48 | if prev != "" and tokens[-1].startswith("I-"):
49 | BIO_tag = tokens[-1]
50 | prev =tokens[-1]
51 | else:
52 | if args.needUNK:
53 | BIO_tag = "B-UNK"
54 | else:
55 | #BIO_tag = "O"
56 | BIO_tag = tokens[1]
57 | prev = ""
58 | actual_one_line.append(tokens[0] + "\t" + BIO_tag)
59 |
60 |
61 | index = 0
62 | lines = []
63 | one_line = []
64 | for line in actual_line:
65 | prev = ""
66 | for token_tag in line:
67 | current_tag = token_tag.split("\t")[-1]
68 | token = token_tag.split("\t")[0]
69 |
70 |
71 | if prev != "":
72 | if prev == "O" and "I-" in current_tag:
73 | #print("Check index :{0} for inconsistency {1}".format(index, token))
74 | token_tag = token + "\t" + "B-" + current_tag.split("-")[-1]
75 |
76 |                 if (prev == "B-PER" or prev == "I-PER") and current_tag in ['I-LOC', 'I-ORG', 'I-GPE']:
77 | #print("Check index :{0} for inconsistency {1}".format(index, token))
78 | token_tag = token + "\t" + "I-PER"
79 |
80 |                 if (prev == "B-GPE" or prev == "I-GPE") and current_tag in ['I-LOC', 'I-ORG', 'I-PER']:
81 | #print("Check index :{0} for inconsistency".format(index,token))
82 | token_tag = token + "\t" + "I-GPE"
83 |
84 |                 if (prev == "B-LOC" or prev == "I-LOC") and current_tag in ['I-PER', 'I-ORG', 'I-GPE']:
85 | #print("Check index :{0} for inconsistency {1}".format(index,token))
86 | token_tag = token + "\t" + "I-LOC"
87 |
88 |                 if (prev == "B-ORG" or prev == "I-ORG") and current_tag in ['I-LOC', 'I-PER', 'I-GPE']:
89 | #print("Check index :{0} for inconsistency {1}".format(index,token))
90 | token_tag = token + "\t" + "I-ORG"
91 |
92 |
93 |
94 | prev = current_tag
95 |
96 | index +=1
97 | one_line.append(token_tag)
98 | #fout.write(token_tag + "\n")
99 | index += 1
100 | lines.append(one_line)
101 | one_line =[]
102 | #fout.write("\n")
103 | print(len(lines))
104 | for line_num, line in enumerate(lines):
105 | prev = ""
106 | for token_num, token_tag in enumerate(line):
107 | token = token_tag.split("\t")[0]
108 | tag = token_tag.split("\t")[-1]
109 | if prev != "":
110 |                 if prev in ["B-UNK", "O"] and tag in ["I-LOC", "I-GPE", "I-MISC", "I-PER", "I-ORG"]:
111 | gold_one_line = gold_lines[line_num]
112 | gold_cur_tag = gold_one_line[token_num]
113 | temp_num = deepcopy(token_num)
114 | while not gold_cur_tag.startswith("B-"):
115 | temp_num -=1
116 | gold_cur_tag = gold_one_line[temp_num]
117 | line[temp_num] = line[temp_num].split("\t")[0] + "\t" + gold_cur_tag
118 | prev = tag
119 |
120 | for token_tag in line:
121 | fout.write(token_tag + '\n')
122 | fout.write("\n")
123 |
124 |
125 | if __name__ == "__main__":
126 | parser = argparse.ArgumentParser()
127 | parser.add_argument("--input", type=str, default=None, help="Active learning output")
128 |     parser.add_argument("--output", type=str, default=None, help="Simulated NI with gold annotations in place of UNK")
129 |     parser.add_argument("--needUNK", default=False, action="store_true", help="Tag tokens outside the selected spans as B-UNK (for the partial CRF) instead of keeping their existing tags")
130 | args = parser.parse_args()
131 |
132 | annotate(args.input, args.output)
133 |
--------------------------------------------------------------------------------
/args.py:
--------------------------------------------------------------------------------
1 | def init_config():
2 | import argparse
3 | parser = argparse.ArgumentParser()
4 | parser.add_argument("--dynet-mem", default=1000, type=int)
5 | parser.add_argument("--dynet-seed", default=5783287, type=int)
6 | parser.add_argument("--dynet-gpu")
7 |
8 | parser.add_argument("--model_name", type=str, default=None)
9 | parser.add_argument("--eval_folder", type=str, default="../eval")
10 | parser.add_argument("--lang", default="english", help="the target language")
11 | parser.add_argument("--train_ensemble", default=False, action="store_true")
12 | parser.add_argument("--full_data_path", type=str, default=None, help="when train_ensemble is true, this one is the full data path from which to load vocabulary.")
13 | parser.add_argument("--train_path", default="../datasets/english/eng.train.bio.conll", type=str)
14 | # parser.add_argument("--train_path", default="../datasets/english/debug_train.bio", type=str)
15 | parser.add_argument("--monolingual_data_path", default=None, type=str)
16 | parser.add_argument("--dev_path", default="../datasets/english/eng.dev.bio.conll", type=str)
17 | parser.add_argument("--test_path", default="../datasets/english/eng.test.bio.conll", type=str)
18 | parser.add_argument("--new_test_path", default="../datasets/english/eng.test.bio.conll", type=str)
19 | parser.add_argument("--new_test_conll", default="../datasets/english/eng.test.bio.conll", type=str)
20 | parser.add_argument("--save_to_path", default="../saved_models/")
21 | parser.add_argument("--load_from_path", default=None)
22 | parser.add_argument("--train_filename_path", default=None, type=str)
23 | parser.add_argument("--dev_filename_path", default=None, type=str)
24 | parser.add_argument("--test_filename_path", default=None, type=str)
25 |
26 |
27 | parser.add_argument("--model_arc", default="char_cnn", choices=["char_cnn", "char_birnn", "char_birnn_cnn", "sep", "sep_cnn_only"], type=str)
28 | parser.add_argument("--tag_emb_dim", default=50, type=int)
29 | parser.add_argument("--pos_emb_dim", default=50, type=int)
30 | parser.add_argument("--char_emb_dim", default=30, type=int)
31 | parser.add_argument("--word_emb_dim", default=100, type=int)
32 | parser.add_argument("--cnn_filter_size", default=30, type=int)
33 | parser.add_argument("--cnn_win_size", default=3, type=int)
34 | parser.add_argument("--rnn_type", default="lstm", choices=['lstm', 'gru'], type=str)
35 | parser.add_argument("--hidden_dim", default=200, type=int, help="token level rnn hidden dim")
36 | parser.add_argument("--char_hidden_dim", default=25, type=int, help="char level rnn hidden dim")
37 | parser.add_argument("--layer", default=1, type=int)
38 |
39 |     parser.add_argument("--replace_unk_rate", default=0.0, type=float, help="used when not all words in the test data are covered by the pretrained embedding")
40 | parser.add_argument("--remove_singleton", default=False, action="store_true")
41 | parser.add_argument("--map_pretrain", default=False, action="store_true")
42 | parser.add_argument("--map_dim", default=100, type=int)
43 | parser.add_argument("--pretrain_fix", default=False, action="store_true")
44 |
45 | parser.add_argument("--output_dropout_rate", default=0.5, type=float, help="dropout applied to the output of birnn before crf")
46 | parser.add_argument("--emb_dropout_rate", default=0.3, type=float, help="dropout applied to the input of token-level birnn")
47 | parser.add_argument("--valid_freq", default=500, type=int)
48 | parser.add_argument("--tot_epochs", default=100, type=int)
49 | parser.add_argument("--batch_size", default=10, type=int)
50 | parser.add_argument("--init_lr", default=0.015, type=float)
51 | parser.add_argument("--lr_decay", default=False, action="store_true")
52 | parser.add_argument("--decay_rate", default=0.05, action="store", type=float)
53 | parser.add_argument("--patience", default=3, type=int)
54 |
55 | parser.add_argument("--tagging_scheme", default="bio", choices=["bio", "bioes"], type=str)
56 |
57 |     parser.add_argument("--data_aug", default=False, action="store_true", help="If data_aug is used, train_path should be the combined training file")
58 | parser.add_argument("--aug_lang", default="english", help="the language to augment the dataset")
59 | parser.add_argument("--aug_lang_train_path", default=None, type=str)
60 | parser.add_argument("--tgt_lang_train_path", default="../datasets/english/eng.train.bio.conll", type=str)
61 |
62 | parser.add_argument("--pretrain_emb_path", type=str, default=None)
63 | parser.add_argument("--res_discrete_feature", default=False, action="store_true", help="residual use of discrete features")
64 |
65 | parser.add_argument("--feature_birnn_hidden_dim", default=50, type=int, action="store")
66 |
67 | parser.add_argument("--use_discrete_features", default=False, action="store_true", help="David's indicator features")
68 | parser.add_argument("--split_hashtag", default=False, action="store_true", help="indicator of preceding hashtags")
69 | parser.add_argument("--cap", default=False, action="store_true", help="capitalization feature")
70 | parser.add_argument("--feature_dim", type=int, default=10, help="dimension of discrete features")
71 |
72 | parser.add_argument("--use_brown_cluster", default=False, action="store_true")
73 | parser.add_argument("--brown_cluster_path", action="store", type=str, help="path to the brown cluster features")
74 | parser.add_argument("--brown_cluster_num", default=0, type=int, action="store")
75 | parser.add_argument("--brown_cluster_dim", default=30, type=int, action="store")
76 |
77 | # Use trained model to test
78 | parser.add_argument("--mode", default="train", type=str, choices=["train", "test_1"],
79 | help="test_1: use one model")
80 |
81 | # Partial CRF
82 | parser.add_argument("--use_partial", default=False, action="store_true")
83 |
84 | # Active Learning
85 | parser.add_argument("--ngram", default=2, type=int)
86 | parser.add_argument("--to_annotate", type=str,default="./annotate.txt")
87 | parser.add_argument("--entropy_threshold", type=float, default=None)
88 | parser.add_argument("--use_CFB", default=False, action="store_true")
89 | parser.add_argument("--SPAN_wise", default=False, action="store_true", help="get span wise scores, even if there are duplicates.")
90 | parser.add_argument("--k", default=200, type=int, help="fixed number of spans to annotate")
91 | parser.add_argument("--debug", type=str)
92 | # Format of test output
93 | parser.add_argument("--test_conll", default=False, action="store_true")
94 | parser.add_argument("--fixedVocab", default=False, action="store_true", help="for loading pre-trained model")
95 | parser.add_argument("--fineTune", default=False, action="store_true", help="for loading pre-trained model")
96 | parser.add_argument("--run",default=0, type=int)
97 | parser.add_argument("--misc",default=False, action="store_true")
98 | parser.add_argument("--addbias", default=False, action="store_true")
99 | args = parser.parse_args()
100 |
101 |
102 | if args.train_ensemble:
103 | # model_name = ens_1_ + original
104 | # set dynet seed manually
105 | ens_no = int(args.model_name.split("_")[1])
106 | # dyparams = dy.DynetParams()
107 | # dyparams.set_random_seed(ens_no + 5783287)
108 | # dyparams.init()
109 |
110 | import dynet_config
111 | dynet_config.set(random_seed=ens_no + 5783290)
112 | # if args.cuda:
113 | # dynet_config.set_gpu()
114 |
115 | # args.train_path = args.train_path.split(".")[0] + "_" + str(ens_no) + ".conll"
116 |
117 | if args.full_data_path is None:
118 | args.full_data_path = args.train_path
119 | args.save_to_path = args.save_to_path + args.model_name + ".model"
120 | print(args)
121 | return args
122 |
--------------------------------------------------------------------------------
/models/model_builder.py:
--------------------------------------------------------------------------------
1 | __author__ = 'chuntingzhou and aditichaudhary'
2 | from encoders import *
3 | from decoders import *
4 | from collections import defaultdict
5 | from copy import deepcopy
6 |
7 | #np.set_printoptions(threshold='nan')
8 |
9 |
10 | class CRF_Model(object):
11 | def __init__(self, args, data_loader, lm_data_loader=None):
12 | self.save_to = args.save_to_path
13 | self.load_from = args.load_from_path
14 | tag_to_id = data_loader.tag_to_id
15 | self.constraints = None
16 | # print self.constraints
17 |
18 | #partial CRF
19 | self.use_partial = args.use_partial
20 | self.tag_to_id = tag_to_id
21 | self.B_UNK = data_loader.B_UNK
22 | self.I_UNK = data_loader.I_UNK
23 |
24 | #active learning for partial annotations
25 | self.entropy_spans = defaultdict(lambda: 0)
26 | self.full_sentences = {}
27 | self.use_CFB = args.use_CFB
28 | self.addbias = args.addbias
29 | self.B_tags = []
30 | self.I_tags = []
31 | self.O_tags = []
32 | B_tags = []
33 | I_tags = []
34 | for tag in tag_to_id:
35 | if "B-" in tag:
36 | B_tags.append(tag)
37 | elif "I-" in tag:
38 | I_tags.append(tag)
39 | elif tag == "O":
40 | self.O_tags.append(tag_to_id[tag])
41 | B_tags = sorted(B_tags)
42 | I_tags = sorted(I_tags)
43 | self.B_tags = [tag_to_id[tag] for tag in B_tags]
44 | self.I_tags = [tag_to_id[tag] for tag in I_tags]
45 |
46 | def forward(self, sents, char_sents, feats, bc_feats, training=True):
47 | raise NotImplementedError
48 |
49 | def save(self):
50 | if self.save_to is not None:
51 | self.model.save(self.save_to)
52 | else:
53 | print('Save to path not provided!')
54 |
55 | def load(self, path=None):
56 | if path is None:
57 | path = self.load_from
58 | if self.load_from is not None or path is not None:
59 | print('Load model parameters from %s!' % path)
60 | self.model.populate(path)
61 | else:
62 | print('Load from path not provided!')
63 |
64 | def cal_loss(self, sents, char_sents, ner_tags, feats, bc_feats, known_tags, lm_batch=None, training=True):
65 | birnn_outputs = self.forward(sents, char_sents, feats, bc_feats, training=training)
66 | crf_loss = self.crf_decoder.decode_loss(birnn_outputs, ner_tags,self.use_partial, known_tags, self.tag_to_id, self.B_UNK, self.I_UNK)
67 | return crf_loss#, sum_s, sent_s
68 |
69 | def eval(self, sents, char_sents, feats, bc_feats, training=False,type="eval"):
70 | birnn_outputs = self.forward(sents, char_sents, feats, bc_feats, training=training)
71 | best_score, best_path, tag_scores = self.crf_decoder.decoding(birnn_outputs, self.O_tags, addbias=self.addbias)
72 | best_path_copy = deepcopy(best_path)
73 | if type == "test":
74 | alpha_value, alphas = self.crf_decoder.forward_alg(tag_scores)
75 | beta_value, betas = self.crf_decoder.backward_one_sequence(tag_scores)
76 | # print("Alpha:{0} Beta:{1}".format(alpha_value.value(), beta_value.value()))
77 | sent = sents[0]
78 | gammas = []
79 | sum = []
80 | for i in range(len(sent)):
81 | gammas.append(alphas[i] + betas[i] - alpha_value)
82 |
83 | if self.use_CFB:
84 | self.crf_decoder.get_uncertain_subsequences_CFB(sent, tag_scores, alphas, betas, alpha_value, gammas,
85 | best_path_copy, self.tag_to_id
86 | , self.B_UNK, self.I_UNK)
87 |
88 | else:
89 | self.crf_decoder.get_uncertain_subsequences(sent, tag_scores, alphas, betas, alpha_value, gammas,
90 | best_path_copy
91 | , self.B_tags, self.I_tags, self.O_tags)
92 |
93 |
94 | return best_score - alpha_value, best_path
95 | else:
96 | return best_score, best_path
97 |
98 | def eval_scores(self, sents, char_sents, feats, bc_feats, training=False):
99 | birnn_outputs = self.forward(sents, char_sents, feats, bc_feats, training=training)
100 | tag_scores, transit_score = self.crf_decoder.get_crf_scores(birnn_outputs)
101 | return tag_scores, transit_score
102 |
103 |
104 | class vanilla_NER_CRF_model(CRF_Model):
105 | ''' Implement End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF. '''
106 | def __init__(self, args, data_loader, lm_data_loader=None):
107 | # super(vanilla_NER_CRF_model, self).__init__(args, data_loader)
108 | self.model = dy.Model()
109 | self.args = args
110 | super(vanilla_NER_CRF_model, self).__init__(args, data_loader)
111 |
112 | self.res_discrete = args.res_discrete_feature
113 |
114 | ner_tag_size = data_loader.ner_vocab_size
115 | char_vocab_size = data_loader.char_vocab_size
116 | word_vocab_size = data_loader.word_vocab_size
117 | word_padding_token = data_loader.word_padding_token
118 |
119 | char_emb_dim = args.char_emb_dim
120 | word_emb_dim = args.word_emb_dim
121 | tag_emb_dim = args.tag_emb_dim
122 | birnn_input_dim = args.cnn_filter_size + args.word_emb_dim
123 | hidden_dim = args.hidden_dim
124 | src_ctx_dim = args.hidden_dim * 2
125 |
126 | cnn_filter_size = args.cnn_filter_size
127 | cnn_win_size = args.cnn_win_size
128 | output_dropout_rate = args.output_dropout_rate
129 | emb_dropout_rate = args.emb_dropout_rate
130 |
131 | if args.use_discrete_features:
132 | self.num_feats = data_loader.num_feats
133 | self.feature_encoder = Discrete_Feature_Encoder(self.model, self.num_feats, args.feature_dim)
134 | if self.res_discrete:
135 | src_ctx_dim += args.feature_dim * self.num_feats
136 | else:
137 | birnn_input_dim += args.feature_dim * self.num_feats
138 |
139 | if args.use_brown_cluster:
140 | bc_num = args.brown_cluster_num
141 | bc_dim = args.brown_cluster_dim
142 |             # for each batch, the lengths of the input seqs are the same, so we don't have to bother with padding
143 | self.bc_encoder = Lookup_Encoder(self.model, args, bc_num, bc_dim, word_padding_token, isFeatureEmb=True)
144 |
145 | if self.res_discrete:
146 | src_ctx_dim += bc_dim
147 | else:
148 | birnn_input_dim += bc_dim
149 |
150 | self.char_cnn_encoder = CNN_Encoder(self.model, char_emb_dim, cnn_win_size, cnn_filter_size,
151 | 0.0, char_vocab_size, data_loader.char_padding_token)
152 | if args.pretrain_emb_path is None:
153 | self.word_lookup = Lookup_Encoder(self.model, args, word_vocab_size, word_emb_dim, word_padding_token)
154 | else:
155 | print("In NER CRF: Using pretrained word embedding!")
156 | self.word_lookup = Lookup_Encoder(self.model, args, word_vocab_size, word_emb_dim, word_padding_token, data_loader.pretrain_word_emb)
157 | self.birnn_encoder = BiRNN_Encoder(self.model, birnn_input_dim, hidden_dim, emb_dropout_rate=emb_dropout_rate,
158 | output_dropout_rate=output_dropout_rate)
159 |
160 | self.crf_decoder = chain_CRF_decoder(args, self.model, src_ctx_dim, tag_emb_dim, ner_tag_size, constraints=self.constraints)
161 |
162 | def forward(self, sents, char_sents, feats, bc_feats, training=True):
163 | char_embs = self.char_cnn_encoder.encode(char_sents, training=training)
164 | word_embs = self.word_lookup.encode(sents)
165 |
166 | if self.args.use_discrete_features:
167 | feat_embs = self.feature_encoder.encode(feats)
168 |
169 | if self.args.use_brown_cluster:
170 | bc_feat_embs = self.bc_encoder.encode(bc_feats)
171 |
172 | if self.args.use_discrete_features and self.args.use_brown_cluster:
173 | concat_inputs = [dy.concatenate([c, w, f, b]) for c, w, f, b in
174 | zip(char_embs, word_embs, feat_embs, bc_feat_embs)]
175 | elif self.args.use_brown_cluster and not self.args.use_discrete_features:
176 | concat_inputs = [dy.concatenate([c, w, f]) for c, w, f in
177 | zip(char_embs, word_embs, bc_feat_embs)]
178 | elif self.args.use_discrete_features and not self.args.use_brown_cluster:
179 | concat_inputs = [dy.concatenate([c, w, f]) for c, w, f in
180 | zip(char_embs, word_embs, feat_embs)]
181 | else:
182 | concat_inputs = [dy.concatenate([c, w]) for c, w in zip(char_embs, word_embs)]
183 |
184 | birnn_outputs = self.birnn_encoder.encode(concat_inputs, training=training)
185 | return birnn_outputs
186 |
--------------------------------------------------------------------------------
/dataloaders/data_loader.py:
--------------------------------------------------------------------------------
1 | __author__ = 'chuntingzhou and aditichaudhary'
2 | import os
3 | from utils.util import *
4 | from utils.features import *
5 | #from utils.segnerfts import orm_morph as ormnorm
6 |
7 | #tagset = ['B-LOC','B-PER','B-MISC', 'B-ORG','I-LOC','I-PER','I-MISC', 'I-ORG','O']
8 | tagset = ['B-LOC','B-PER','B-GPE', 'B-ORG','I-LOC','I-PER','I-GPE', 'I-ORG','O']
9 |
10 | class NER_DataLoader():
11 | def __init__(self, args, special_normal=False):
12 | # This is data loader as well as feature extractor!!
13 |
14 | self.args = args
15 | if args.train_ensemble:
16 | self.train_path = args.full_data_path
17 | else:
18 | self.train_path = args.train_path
19 | self.test_path = args.test_path
20 | self.dev_path = args.dev_path
21 | self.args = args
22 |
23 | self.tag_vocab_path = self.train_path + ".tag_vocab"
24 | self.word_vocab_path = self.train_path + ".word_vocab"
25 | self.char_vocab_path = self.train_path + ".char_vocab"
26 |
27 | self.pretrained_embedding_path = args.pretrain_emb_path
28 | self.use_discrete_feature = args.use_discrete_features
29 | self.use_brown_cluster = args.use_brown_cluster
30 |
31 | self.train_senttypes = self.dev_senttypes = self.test_senttypes = None
32 |
33 | if self.use_brown_cluster:
34 | self.brown_cluster_dicts = get_brown_cluster(args.brown_cluster_path)
35 | self.brown_cluster_dicts[''] = len(self.brown_cluster_dicts)
36 | args.brown_cluster_num = len(self.brown_cluster_dicts)
37 | else:
38 | self.brown_cluster_dicts = None
39 |
40 | print("Generating vocabs from training file ....")
41 | paths_to_read = [self.train_path, self.dev_path, self.test_path]
42 |
43 |         if args.fixedVocab:  # Make vocabulary from args.aug_lang_train_path
44 | _, self.word_to_id, self.char_to_id = self.read_files([self.args.aug_lang_train_path])
45 | self.tag_to_id = {}
46 | # self.word_to_id = {}
47 | # self.char_to_id = {}
48 | for tag in tagset:
49 | if args.misc:
50 | tag = tag.replace("GPE", "MISC")
51 | self.tag_to_id[tag] = len(self.tag_to_id)
52 | else:
53 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files(paths_to_read)
54 | print("Size of vocab before: %d" % len(self.word_to_id))
55 | self.word_to_id[''] = len(self.word_to_id) + 1
56 | self.char_to_id[''] = len(self.char_to_id) + 1
57 | self.word_to_id['<\s>'] = 0
58 | self.char_to_id[''] = 0
59 | print("Size of vocab after: %d" % len(self.word_to_id))
60 | self.word_padding_token = 0
61 | self.char_padding_token = 0
62 |
63 | if self.pretrained_embedding_path is not None:
64 | self.pretrain_word_emb, self.word_to_id, self.char_to_id = get_pretrained_emb(self.args.fixedVocab, self.pretrained_embedding_path,
65 | self.word_to_id, self.char_to_id, args.word_emb_dim)
66 |
67 |         # for char vocab and word vocab, we reserve id 0 for the eos padding, and len(vocab)-1 for the unknown token
68 | self.id_to_tag = {v: k for k, v in self.tag_to_id.iteritems()}
69 | self.id_to_word = {v: k for k, v in self.word_to_id.iteritems()}
70 | self.id_to_char = {v: k for k, v in self.char_to_id.iteritems()}
71 |
72 | self.ner_vocab_size = len(self.id_to_tag)
73 | self.word_vocab_size = len(self.id_to_word)
74 | self.char_vocab_size = len(self.id_to_char)
75 |
76 | self.cap_ratio_dict = None
77 |
78 | #Partial CRF
79 | self.B_UNK = self.ner_vocab_size + 1
80 | self.I_UNK = self.ner_vocab_size + 2
81 |
82 | print("Size of vocab after: %d" % len(self.word_to_id))
83 | print("NER tag num=%d, Word vocab size=%d, Char Vocab size=%d" % (self.ner_vocab_size, self.word_vocab_size, self.char_vocab_size))
84 |
85 |
86 | @staticmethod
87 | def exists(path):
88 | return os.path.exists(path)
89 |
90 | def read_one_line(self, line, tag_set, word_dict, char_set):
91 | for w in line:
92 | fields = w.split()
93 | if len(fields) !=2:
94 | print("ERROR! Incorrect number of fields in the file, required two.")
95 | print(fields)
96 | exit(0)
97 | word = fields[0]
98 | ner_tag = fields[-1]
99 |
100 | for c in word:
101 | char_set.add(c)
102 | if "UNK" not in ner_tag:
103 | if self.args.misc:
104 | ner_tag = ner_tag.replace("GPE","MISC")
105 | tag_set.add(ner_tag)
106 | word_dict[word] += 1
107 |
108 | def get_vocab_from_set(self, a_set, shift=0):
109 | vocab = {}
110 | for i, elem in enumerate(a_set):
111 | vocab[elem] = i + shift
112 |
113 | return vocab
114 |
115 | def get_vocab_from_dict(self, a_dict, shift=0, remove_singleton=False):
116 | vocab = {}
117 | i = 0
118 | self.singleton_words = set()
119 |
120 | #Sort the defaultdict
121 | sortedDict = sorted(a_dict.iteritems(), key=lambda (k, v): v, reverse=True)
122 | for (k,v) in sortedDict:
123 |
124 | #for k, v in a_dict.iteritems():
125 | if v == 1:
126 | self.singleton_words.add(i + shift)
127 | if remove_singleton:
128 | if v > 1:
129 | # print k, v
130 | vocab[k] = i + shift
131 | i += 1
132 | else:
133 | vocab[k] = i + shift
134 | i += 1
135 | print("Singleton words number: %d" % len(self.singleton_words))
136 | return vocab
137 |
138 | def read_files(self, paths):
139 | # word_list = []
140 | # char_list = []
141 | # tag_list = []
142 | word_dict = defaultdict(lambda: 0)
143 | char_set = set()
144 | tag_set = set()
145 |
146 | def _read_a_file(path):
147 | with codecs.open(path, "r", "utf-8") as fin:
148 | to_read_line = []
149 | for line in fin:
150 | if line.strip() == "":
151 | self.read_one_line(to_read_line, tag_set, word_dict, char_set)
152 | to_read_line = []
153 | else:
154 | to_read_line.append(line.strip())
155 | self.read_one_line(to_read_line, tag_set, word_dict, char_set)
156 |
157 | for path in paths:
158 | _read_a_file(path)
159 |
160 | tag_vocab = self.get_vocab_from_set(tag_set)
161 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton)
162 | char_vocab = self.get_vocab_from_set(char_set, 1)
163 |
164 | return tag_vocab, word_vocab, char_vocab
165 |
166 | def get_data_set(self, path, lang, source="train"):
167 | sents = []
168 | char_sents = []
169 | tgt_tags = []
170 | discrete_features = []
171 | bc_features = []
172 | known_tags = []
173 |
174 | if source == "train":
175 | sent_types = self.train_senttypes
176 | else:
177 | sent_types = self.dev_senttypes
178 |
179 | def add_sent(one_sent, type):
180 | temp_sent = []
181 | temp_ner = []
182 | temp_char = []
183 | temp_bc = []
184 | sent = []
185 | temp_known_tag = []
186 | for w in one_sent:
187 | fields = w.split()
188 | if len(fields)!=2:
189 | fields = w.split("\t")
190 | assert len(fields)==2
191 | word = fields[0]
192 | sent.append(word)
193 | ner_tag = fields[-1]
194 | if self.use_brown_cluster:
195 | temp_bc.append(self.brown_cluster_dicts[word] if word in self.brown_cluster_dicts else self.brown_cluster_dicts[""])
196 |
197 | if self.args.fixedVocab:
198 | if word in self.word_to_id:
199 | temp_sent.append(self.word_to_id[word])
200 | elif word.lower() in self.word_to_id:
201 | temp_sent.append(self.word_to_id[word.lower()])
202 | else:
203 | temp_sent.append(self.word_to_id[""])
204 | else:
205 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""])
206 |
207 | if "B-UNK" in ner_tag:
208 | temp_ner.append(self.B_UNK)
209 | elif "I-UNK" in ner_tag:
210 | temp_ner.append(self.I_UNK)
211 | else:
212 | if self.args.misc:
213 | ner_tag = ner_tag.replace("GPE","MISC")
214 | temp_ner.append(self.tag_to_id[ner_tag])
215 |
216 | if "UNK" in ner_tag:
217 | temp_known_tag.append([0])
218 | else:
219 | temp_known_tag.append([1])
220 |
221 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word])
222 |
223 | sents.append(temp_sent)
224 | char_sents.append(temp_char)
225 | tgt_tags.append(temp_ner)
226 | bc_features.append(temp_bc)
227 | known_tags.append(temp_known_tag)
228 | discrete_features.append([])
229 |
230 | # print len(discrete_features[-1])
231 |
232 | with codecs.open(path, "r", "utf-8") as fin:
233 | i = 0
234 | one_sent = []
235 | for line in fin:
236 | if line.strip() == "" or line.strip() == "\n":
237 | if len(one_sent) > 0:
238 | add_sent(one_sent, sent_types[i] if sent_types is not None else None)
239 | i += 1
240 | if i % 1000 == 0:
241 | print("Processed %d training data." % (i,))
242 | one_sent = []
243 | else:
244 | one_sent.append(line.strip())
245 |
246 | if len(one_sent) > 0:
247 | add_sent(one_sent, sent_types[i] if sent_types is not None else None)
248 | i += 1
249 |
250 | if sent_types is not None:
251 | assert i == len(sent_types), "Not match between number of sentences and sentence types!"
252 |
253 | if self.use_discrete_feature:
254 | self.num_feats = len(discrete_features[0][0])
255 | else:
256 | self.num_feats = 0
257 | return sents, char_sents, tgt_tags, discrete_features, bc_features, known_tags
258 |
--------------------------------------------------------------------------------
/utils/util.py:
--------------------------------------------------------------------------------
1 | __author__ = 'chuntingzhou'
2 | import dynet as dy
3 | import numpy as np
4 | from collections import defaultdict
5 | import gzip
6 | import cPickle as pkl
7 | import codecs
8 | import math
9 | import random
10 | from random import shuffle
11 |
12 | random.seed(448)
13 | np.random.seed(1)
14 | import operator
15 | import re
16 |
17 | MAX_CHAR_LENGTH = 45
18 |
19 | # Regular expressions used to normalize digits.
20 | DIGIT_RE = re.compile(br"\d")
21 |
22 |
23 | # word = utils.DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0]
24 |
25 |
26 | def iob2(tags):
27 | """
28 | Check that tags have a valid IOB format.
29 | Tags in IOB1 format are converted to IOB2.
30 | """
31 | for i, tag in enumerate(tags):
32 | if tag == 'O':
33 | continue
34 | split = tag.split('-')
35 | if len(split) != 2 or split[0] not in ['I', 'B']:
36 | return False
37 | if split[0] == 'B':
38 | continue
39 | elif i == 0 or tags[i - 1] == 'O': # conversion IOB1 to IOB2
40 | tags[i] = 'B' + tag[1:]
41 | elif tags[i - 1][1:] == tag[1:]:
42 | continue
43 | else: # conversion IOB1 to IOB2
44 | tags[i] = 'B' + tag[1:]
45 | return True
46 |
47 |
48 | def get_entity(label):
49 | entities = []
50 | i = 0
51 | while i < len(label):
52 | if label[i] != 'O':
53 | e_type = label[i][2:]
54 | j = i + 1
55 | while j < len(label) and label[j] == 'I-' + e_type:
56 | j += 1
57 | entities.append((i, j, e_type))
58 | i = j
59 | else:
60 | i += 1
61 | return entities
62 |
63 |
64 | def evaluate_ner(pred, gold):
65 | tp = 0
66 | fp = 0
67 | fn = 0
68 | for i in range(len(pred)):
69 | pred_entities = get_entity(pred[i])
70 | gold_entities = get_entity(gold[i])
71 | temp = 0
72 | for entity in pred_entities:
73 | if entity in gold_entities:
74 | tp += 1
75 | temp += 1
76 | else:
77 | fp += 1
78 | fn += len(gold_entities) - temp
79 | precision = 1.0 * tp / (tp + fp)
80 | recall = 1.0 * tp / (tp + fn)
81 | f1 = 2 * precision * recall / (precision + recall)
82 | return precision, recall, f1
83 |
84 |
85 | def fopen(filename, mode='r'):
86 | if filename.endswith('.gz'):
87 | return gzip.open(filename, mode)
88 | return open(filename, mode)
89 |
90 |
91 | def get_pretrained_emb(fixedVocab, path_to_emb, word_to_id, char_to_id, dim):
92 | word_emb = []
93 | print("Loading pretrained embeddings from %s." % (path_to_emb))
94 | print("length of dict: %d" % len(word_to_id))
95 |
96 | pretrain_word_emb = {}
97 | pretrain_vocab = []
98 | for line in codecs.open(path_to_emb, "r", "utf-8", errors='replace'):
99 | items = line.strip().split()
100 | if len(items) == dim + 1:
101 | try:
102 | pretrain_word_emb[items[0]] = np.asarray(items[1:]).astype(np.float32)
103 | pretrain_vocab.append(items[0])
104 | except ValueError:
105 | continue
106 |
107 | for _ in range(len(word_to_id)):
108 | word_emb.append(np.random.uniform(-math.sqrt(3.0 / dim), math.sqrt(3.0 / dim), size=dim))
109 |
110 | not_covered = 0
111 | print(len(word_to_id), len(word_emb))
112 |
113 | for word, id in word_to_id.iteritems():
114 | if word in pretrain_word_emb:
115 | word_emb[id] = pretrain_word_emb[word]
116 | elif word.lower() in pretrain_word_emb:
117 | word_emb[id] = pretrain_word_emb[word.lower()]
118 | else:
119 | not_covered += 1
120 |
121 | if fixedVocab:
122 | #Take top 100000 from the word embeddings
123 | num = 0
124 | for word in pretrain_vocab:
125 | if num > 400000:
126 | break
127 | if word not in word_to_id:
128 | word_to_id[word] = len(word_to_id)
129 | word_emb.append(pretrain_word_emb[word])
130 | num +=1
131 |
132 | else:
133 | for word in pretrain_word_emb.keys():
134 | if word not in word_to_id:
135 | word_to_id[word] = len(word_to_id)
136 | word_emb.append(pretrain_word_emb[word])
137 |
138 | emb = np.array(word_emb, dtype=np.float32)
139 |
140 | print("Word number not covered in pretrain embedding: %d" % not_covered)
141 | return emb, word_to_id, char_to_id
142 |
143 |
144 | def pkl_dump(obj, path):
145 | with open(path, "wb") as fout:
146 | pkl.dump(obj, fout)
147 |
148 |
149 | def pkl_load(path):
150 | with open(path, "rb") as fin:
151 | obj = pkl.load(fin)
152 | return obj
153 |
154 |
155 | def log_sum_exp_dim_0(x):
156 | # numerically stable log_sum_exp
157 | dims = x.dim()
158 | max_score = dy.max_dim(x, 0) # (dim_1, batch_size)
159 | if len(dims[0]) == 1:
160 | max_score_extend = max_score
161 | else:
162 | max_score_reshape = dy.reshape(max_score, (1, dims[0][1]), batch_size=dims[1])
163 | max_score_extend = dy.concatenate([max_score_reshape] * dims[0][0])
164 | x = x - max_score_extend
165 | exp_x = dy.exp(x)
166 | # (dim_1, batch_size), if no dim_1, return ((1,), batch_size)
167 | log_sum_exp_x = dy.log(dy.mean_dim(exp_x, d=[0], b=False) * dims[0][0])
168 | return log_sum_exp_x + max_score
169 |
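# Note on the recursion above (a restatement, not new behavior): this is the usual
# max-shift trick for numerical stability,
#     logsumexp(x) = max(x) + log(sum(exp(x - max(x)))),
# where the sum over dimension 0 is recovered as mean_dim(...) * dims[0][0], since
# DyNet exposes a mean over a dimension rather than a sum here.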
170 |
171 | def data_iterator(data_pair, batch_size):
172 | batches = make_bucket_batches(data_pair, batch_size)
173 | for batch in batches:
174 | yield batch
175 |
176 |
177 | def make_bucket_batches(data_collections, batch_size):
178 | # Data are bucketed according to the length of the first item in the data_collections.
179 | buckets = defaultdict(list)
180 | tot_items = len(data_collections[0])
181 | for data_item in data_collections:
182 | src = data_item[0]
183 | buckets[len(src)].append(data_item)
184 |
185 | batches = []
186 | # np.random.seed(2)
187 | for src_len in buckets:
188 | bucket = buckets[src_len]
189 | np.random.shuffle(bucket)
190 |
191 | num_batches = int(np.ceil(len(bucket) * 1.0 / batch_size))
192 | for i in range(num_batches):
193 | cur_batch_size = batch_size if i < num_batches - 1 else len(bucket) - batch_size * i
194 | batches.append([[bucket[i * batch_size + j][k] for j in range(cur_batch_size)] for k in range(tot_items)])
195 | np.random.shuffle(batches)
196 | return batches
197 |
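# A small sketch of the batch layout (field names assumed for illustration): if each
# element of data_collections is a (sent, tags) pair, sentences of equal length share a
# bucket, each bucket is shuffled and cut into batches of at most batch_size, and every
# batch groups the fields in parallel, e.g.
#     batch == [[sent_a, sent_b], [tags_a, tags_b]]   # sent_a and sent_b have equal length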
198 |
199 | def transpose_input(seq, padding_token=0):
200 | # input seq: list of samples [[w1, w2, ..], [w1, w2, ..]]
201 | max_len = max([len(sent) for sent in seq])
202 | seq_pad = []
203 | seq_mask = []
204 | for i in range(max_len):
205 | pad_temp = [sent[i] if i < len(sent) else padding_token for sent in seq]
206 | mask_temp = [1.0 if i < len(sent) else 0.0 for sent in seq]
207 | seq_pad.append(pad_temp)
208 | seq_mask.append(mask_temp)
209 |
210 | return seq_pad, seq_mask
211 |
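# A minimal sketch (example ids are made up): batches are transposed to time-major
# order and padded, with a parallel 1.0/0.0 mask marking real tokens.
# >>> transpose_input([[1, 2, 3], [4, 5]], padding_token=0)
# ([[1, 4], [2, 5], [3, 0]], [[1.0, 1.0], [1.0, 1.0], [1.0, 0.0]])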
212 |
213 | def transpose_discrete_features(feature_batch):
214 | # Discrete features are zero-one features
215 | # TODO: Other integer features, create lookup tables
216 | # tgt_batch: [[[feature of word 1 of sent 1], [feature of word 2 of sent 2], ]]
217 | # return: [(feature_num, batchsize)]
218 | max_sent_len = max([len(s) for s in feature_batch])
219 | feature_num = len(feature_batch[0][0])
220 | batch_size = len(feature_batch)
221 | features = [] # each: (feature_num, batch_size)
222 | for i in range(max_sent_len):
223 | w_i_feature = [dy.inputTensor(sent[i], batched=True) if i < len(sent) else dy.zeros(feature_num) for sent in feature_batch]
224 | w_i_feature = dy.reshape(dy.concatenate(w_i_feature, d=1), (feature_num,), batch_size=batch_size)
225 | features.append(w_i_feature)
226 |
227 | return features
228 |
229 |
230 | def transpose_and_batch_embs(input_embs, emb_size):
231 | # input_embs: [[w1_emb, w2_emb, ]], embs are dy.expressions
232 | max_len = max(len(sent) for sent in input_embs)
233 | batch_size = len(input_embs)
234 | padded_seq_emb = []
235 | seq_masks = []
236 | for i in range(max_len):
237 | w_i_emb = [sent[i] if i < len(sent) else dy.zeros(emb_size) for sent in input_embs]
238 | w_i_emb = dy.reshape(dy.concatenate(w_i_emb, d=1), (emb_size,), batch_size=batch_size)
239 | w_i_mask = [1.0 if i < len(sent) else 0.0 for sent in input_embs]
240 | padded_seq_emb.append(w_i_emb)
241 | seq_masks.append(w_i_mask)
242 |
243 | return padded_seq_emb, seq_masks
244 |
245 |
246 | def transpose_char_input(tgt_batch, padding_token):
247 | # The tgt_batch may not be padded with <SOW> and <EOW> markers.
248 | # tgt_batch: [[[<SOW>, <EOW>], [<SOW>, s, h, e, <EOW>],
249 | #             [<SOW>, i, s, <EOW>], [<SOW>, p, r, e, t, t, y, <EOW>], [<SOW>, <EOW>]], [[], [], []]]
250 | max_sent_len = max([len(s) for s in tgt_batch])
251 | sent_w_batch = [] # each is list of list: max_word_len, batch_size
252 | sent_mask_batch = [] # each is list of list: max_word_len, batch_size
253 | max_w_lens = []
254 | SOW_PAD = 0
255 | EOW_PAD = 1
256 | EOS_PAD = 2
257 | for i in range(max_sent_len):
258 | max_len_w = max([len(sent[i]) for sent in tgt_batch if i < len(sent)])
259 | max_w_lens.append(max_len_w)
260 | w_batch = []
261 | mask_batch = []
262 | for j in range(0, max_len_w):
263 | temp_j_w = []
264 | for sent in tgt_batch:
265 | if i < len(sent) and j < len(sent[i]):
266 | temp_j_w.append(sent[i][j])
267 | elif i >= len(sent):
268 | if j == 0:
269 | temp_j_w.append(SOW_PAD)
270 | elif j == max_len_w - 1:
271 | temp_j_w.append(EOW_PAD)
272 | else:
273 | temp_j_w.append(EOS_PAD)
274 | else:
275 | temp_j_w.append(EOW_PAD)
276 | # w_batch = [sent[i][j] if i < len(sent) and j < len(sent[i]) else self.EOW for sent in tgt_batch]
277 | # print "temp: ", temp_j_w
278 | w_batch.append(temp_j_w)
279 | mask_batch.append([1. if i < len(sent) and j < len(sent[i]) else 0.0 for sent in tgt_batch])
280 | sent_w_batch.append(w_batch)
281 | sent_mask_batch.append(mask_batch)
282 | return sent_w_batch, sent_mask_batch, max_sent_len, max_w_lens
283 |
284 | def get_vocab_from_set(a_set, shift=0):
285 | vocab = {}
286 | for i, elem in enumerate(a_set):
287 | vocab[elem] = i + shift
288 |
289 | return vocab
290 |
291 | if __name__ == "__main__":
292 | # from scipy.misc import logsumexp
293 | # import numpy as np
294 | #
295 | # # a = np.random.rand(3, 4, 2)
296 | # # b = logsumexp(a, axis=0)
297 | # # a_t = dy.inputTensor(a, batched=True)
298 | # # b_t = log_sum_exp_dim_0(a_t)
299 | # # print "numpy "
300 | # # print b
301 | # # print "dynet "
302 | # # print b_t.value(), b_t.dim()
303 | # # print dy.pick_batch_elem(b_t, 1).npvalue()
304 | #
305 | # a = np.random.rand(3, 2)
306 | # b = logsumexp(a, axis=0)
307 | # a_t = dy.inputTensor(a, batched=True)
308 | # b_t = log_sum_exp_dim_0(a_t)
309 | # print "numpy "
310 | # print b
311 | # print "dynet "
312 | # print b_t.value(), b_t.dim()
313 | # print dy.pick_batch_elem(b_t, 1).npvalue()
314 | dim = 100
315 | # 9 1000
316 |
317 | path_to_emb = "../datasets/english/glove.6B/glove.6B.100d.txt"
318 | pretrain_word_emb = {}
319 | i = 1
320 | for line in codecs.open(path_to_emb, "r", 'utf-8', errors='replace'):
321 | items = line.strip().split()
322 | if len(items) == dim + 1:
323 | try:
324 | pretrain_word_emb[items[0]] = np.asarray(items[1:]).astype(np.float32)
325 | except ValueError:
326 | continue
327 | print items[0], i, pretrain_word_emb[items[0]][:3]
328 | i += 1
329 |
330 | # gradient clipping
331 | # turn off the dropout
332 | # use smaller initial lr
333 | # variational dropout
334 |
--------------------------------------------------------------------------------
/models/encoders.py:
--------------------------------------------------------------------------------
1 | __author__ = 'chuntingzhou'
2 | from utils.util import *
3 |
4 | ''' Designing idea: the encoder should be agnostic to the input, it can be either
5 | arbitrary spans, characters, or words, or even raw feature. However, user has to specify
6 | whether to have the lookup table for any input.
7 |
8 | There are also two ways to feed in multiple input features:
9 | (a) First concatenate all features for each position, and then use them as features for one encoder, e.g. bilstm
10 | (b) Use multiple encoders for multiple features then combine outputs from multiple encoders, either concat them
11 | or feed them to another encoder.'''
12 |
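# A rough sketch of the two options above, assuming hypothetical per-position word_embs
# and char_embs lists produced by two encoders defined below (names are illustrative):
#   (a) concatenate features first, then run one encoder:
#       inputs = [dy.concatenate([w, c]) for w, c in zip(word_embs, char_embs)]
#       outputs = birnn_encoder.encode(inputs)
#   (b) run one encoder per feature and combine their per-position outputs afterwards,
#       e.g. by concatenation or by feeding the concatenation into a second BiRNN_Encoder.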
13 |
14 | class Encoder():
15 | def __init__(self):
16 | pass
17 |
18 | def encode(self):
19 | raise NotImplementedError
20 |
21 | # class concat_input_encoder(encoder):
22 | # def __init__(self, model, lookups, lookup_table_dims):
23 | # # length of elements in lookup_table_dims == number of elements in lookups which are true
24 | # self.num_inputs = len(lookups)
25 | # self.lookups = lookups
26 | # self.lookup_params = []
27 | # for i, lookup in enumerate(lookups):
28 | # if lookup == 1:
29 | # # add loop up parameters
30 | # self.lookup_params.append(model.add_lookup_parameters((lookup_table_dims[i][0], lookup_table_dims[i][1])))
31 | # elif lookup == 2:
32 | # # add normal transformation parameters
33 | # # dims: discrete_feature_num, continuous_emb_dim
34 | # # the input should concatenate all the discrete features together first
35 | # self.lookup_params.append(model.add_parameters((lookup_table_dims[i][0], lookup_table_dims[i][1])))
36 | # else:
37 | # self.lookup_params.append(0)
38 | #
39 | # def prepare_inputs(self, inputs):
40 | # # inputs: (a)
41 | # input_features = []
42 | # for i, lookup in enumerate(self.lookups):
43 | # if lookup == 1:
44 |
45 |
46 | class Lookup_Encoder(Encoder):
47 | def __init__(self, model, args, vocab_size, emb_size, padding_token=None, pretrain_embedding=None, isFeatureEmb=False):
48 | Encoder.__init__(self)
49 | self.padding_token = padding_token
50 | self.map_pretrain = args.map_pretrain
51 | self.pretrain_fix = args.pretrain_fix
52 | self.isFeatureEmb = isFeatureEmb
53 | if args.map_pretrain:
54 | self.W_map = model.add_parameters((args.map_dim, emb_size))
55 | self.b_map = model.add_parameters(args.map_dim)
56 | self.b_map.zero()
57 | if pretrain_embedding is not None:
58 | self.lookup_table = model.lookup_parameters_from_numpy(pretrain_embedding)
59 | else:
60 | self.lookup_table = model.add_lookup_parameters((vocab_size, emb_size))
61 |
62 | def encode(self, input_seqs):
63 | transpose_inputs, _ = transpose_input(input_seqs, self.padding_token)
64 | embs = [dy.lookup_batch(self.lookup_table, wids) for wids in transpose_inputs]
65 | if self.pretrain_fix and not self.isFeatureEmb:
66 | embs = [dy.nobackprop(emb) for emb in embs]
67 | # TODO: initialize with ones vector, initialize W_map with identity matrix
68 | if self.map_pretrain and not self.isFeatureEmb:
69 | if not self.pretrain_fix:
70 | embs = [dy.nobackprop(emb) for emb in embs]
71 | W_map = dy.parameter(self.W_map)
72 | b_map = dy.parameter(self.b_map)
73 | embs = [dy.affine_transform([b_map, W_map, emb]) for emb in embs]
74 | return embs
75 |
76 |
77 | class Discrete_Feature_Encoder(Encoder):
78 | def __init__(self, model, num_feats, to_dim):
79 | Encoder.__init__(self)
80 | self.num_feats = num_feats
81 | self.to_dim = to_dim
82 | self.W_feat_emb = model.add_parameters((to_dim, num_feats))
83 |
84 | def encode(self, input_feats):
85 | batch_size = len(input_feats)
86 | # after transpose: input_feats: [(num_feats, batch_size)]
87 | input_feats = transpose_discrete_features(input_feats)
88 | W_feat_emb = dy.parameter(self.W_feat_emb)
89 | output_emb = []
90 | for wif in input_feats:
91 | extend_wif = dy.transpose(dy.concatenate_cols([wif for _ in range(self.to_dim)]))
92 | feature_emb = dy.cmult(extend_wif, W_feat_emb)
93 | output_emb.append(dy.reshape(feature_emb, (self.to_dim * self.num_feats, ), batch_size=batch_size))
94 | return output_emb
95 |
96 |
97 | class CNN_Encoder(Encoder):
98 | def __init__(self, model, emb_size, win_size=3, filter_size=64, dropout=0.5, vocab_size=0, padding_token=0, lookup_emb=None):
99 | Encoder.__init__(self)
100 | self.vocab_size = vocab_size # if 0, no lookup tables
101 | self.win_size = win_size
102 | self.filter_size = filter_size
103 | self.emb_size = emb_size
104 | self.dropout_rate = dropout
105 |         self.padding_token = padding_token
106 | if vocab_size != 0:
107 | print("In CNN encoder: creating lookup embedding!")
108 | self.lookup_emb = model.add_lookup_parameters((vocab_size, 1, 1, emb_size))
109 | else:
110 | assert lookup_emb is not None
111 | print("In CNN encoder: reusing lookup embedding!")
112 | self.lookup_emb = lookup_emb
113 |
114 | self.W_cnn = model.add_parameters((1, win_size, emb_size, filter_size))
115 | self.b_cnn = model.add_parameters((filter_size))
116 | self.b_cnn.zero()
117 |
118 | def _cnn_emb(self, input_embs, training):
119 | # input_embs: (h, time_step, dim, batch_size), h=1
120 | if self.dropout_rate > 0 and training:
121 | input_embs = dy.dropout(input_embs, self.dropout_rate)
122 | W_cnn = dy.parameter(self.W_cnn)
123 | b_cnn = dy.parameter(self.b_cnn)
124 |
125 | cnn_encs = dy.conv2d_bias(input_embs, W_cnn, b_cnn, stride=(1, 1), is_valid=False)
126 | tanh_cnn_encs = dy.tanh(cnn_encs)
127 | max_pool_out = dy.reshape(dy.max_dim(tanh_cnn_encs, d=1), (self.filter_size,))
128 | # rec_pool_out = dy.rectify(max_pool_out)
129 | return max_pool_out
130 |
131 | def encode(self, input_seqs, training=True, char=True):
132 | batch_size = len(input_seqs)
133 | sents_embs = []
134 | if char:
135 | # we don't batch at first, we batch after cnn
136 | for sent in input_seqs:
137 | sent_emb = []
138 | for w in sent:
139 | if len(w) < self.win_size:
140 |                         w += [self.padding_token] * (self.win_size - len(w))
141 | input_embs = dy.concatenate([dy.lookup(self.lookup_emb, c) for c in w], d=1)
142 | w_emb = self._cnn_emb(input_embs, training) # (filter_size, 1)
143 | sent_emb.append(w_emb)
144 | sents_embs.append(sent_emb)
145 | sents_embs, sents_mask = transpose_and_batch_embs(sents_embs, self.filter_size) # [(filter_size, batch_size)]
146 | else:
147 | for sent in input_seqs:
148 | if self.vocab_size != 0:
149 | if len(sent) < self.win_size:
150 | sent += [0] * (self.win_size - len(sent))
151 | input_embs = dy.concatenate([dy.lookup(self.lookup_emb, w) for w in sent], d=1)
152 | else:
153 | # input_seqs: [(emb_size, batch_size)]
154 | if len(sent) < self.win_size:
155 | sent += [dy.zeros(self.emb_size)] * (self.win_size - len(sent))
156 | input_embs = dy.transpose(dy.concatenate_cols(sent)) # (time_step, emb_size, bs)
157 | input_embs = dy.reshape(input_embs, (1, len(sent), self.emb_size), )
158 |
159 | sent_emb = self._cnn_emb(input_embs, training) # (filter_size, 1)
160 | sents_embs.append(sent_emb)
161 | sents_embs = dy.reshape(dy.concatenate(sents_embs, d=1), (self.filter_size,), batch_size =batch_size) # (filter_size, batch_size)
162 |
163 | return sents_embs
164 |
165 |
166 | class BiRNN_Encoder(Encoder):
167 | def __init__(self,
168 | model,
169 | input_dim,
170 | hidden_dim,
171 | emb_dropout_rate=0.3,
172 | output_dropout_rate=0.5,
173 | padding_token=None,
174 | vocab_size=0,
175 | emb_size=0,
176 | layer=1,
177 | rnn="lstm",
178 | vocab_emb=None):
179 | Encoder.__init__(self)
180 | # self.birnn = dy.BiRNNBuilder(layer, input_dim, hidden_dim, model, dy.LSTMBuilder if rnn == "lstm" else dy.GRUBuilder)
181 | self.fwd_RNN = dy.LSTMBuilder(layer, input_dim, hidden_dim, model) if rnn == "lstm" else dy.GRUBuilder(layer, input_dim, hidden_dim, model)
182 | self.bwd_RNN = dy.LSTMBuilder(layer, input_dim, hidden_dim, model) if rnn == "lstm" else dy.GRUBuilder(layer, input_dim, hidden_dim, model)
183 |
184 | self.input_dim = input_dim
185 | self.vocab_size = vocab_size
186 | self.padding_token = padding_token
187 | self.drop_out_rate = output_dropout_rate
188 | self.emb_drop_rate = emb_dropout_rate
189 | self.hidden_dim = hidden_dim
190 | if vocab_size > 0:
191 | print("In BiRNN, creating lookup table!")
192 | self.vocab_emb = model.add_lookup_parameters((vocab_size, emb_size))
193 | else:
194 | if vocab_emb is not None:
195 | # assert vocab_emb is not None
196 | self.vocab_emb = vocab_emb
197 | else:
198 | self.vocab_emb = None
199 |
200 | def encode(self, input_seqs, training=True, char=False):
201 | if char:
202 | return self.encode_word(input_seqs, training=training)
203 | else:
204 | return self.encode_seq(input_seqs, training=training)
205 |
206 | def encode_seq(self, input_seqs, training=True, char=False):
207 | if self.vocab_emb is not None:
208 | # input_seqs = [[w1, w2],[]]
209 | transpose_inputs, _ = transpose_input(input_seqs, self.padding_token)
210 | if self.vocab_size != 0:
211 | w_embs = [dy.dropout(dy.lookup_batch(self.vocab_emb, wids),
212 | self.emb_drop_rate) if self.emb_drop_rate > 0. and training
213 | else dy.lookup_batch(self.vocab_emb, wids)
214 | for wids in transpose_inputs]
215 | else:
216 | # print "In our case, use parameters shared by CNN char encoder, need conversion!"
217 | vocab_emb = dy.parameter(self.vocab_emb)
218 | vocab_size = vocab_emb.dim()[0][-1]
219 | # print "In BiRNN Char vocab size: ", vocab_size
220 | vocab_emb = dy.reshape(vocab_emb, (self.input_dim, vocab_size)) # expression, not lookup_parameters
221 |
222 | # for wids in transpose_inputs:
223 | # print wids
224 | # print vocab_emb.dim()
225 | # a = dy.pick_batch(vocab_emb, wids, dim=1)
226 | # print a.value()
227 | # Special case handler: use pick_batch
228 | w_embs = [dy.dropout(dy.pick_batch(vocab_emb, wids, dim=1),
229 | self.emb_drop_rate) if self.emb_drop_rate > 0. and training
230 | else dy.pick_batch(vocab_emb, wids, dim=1)
231 | for wids in transpose_inputs]
232 | # print "In BiRNN char: ", w_embs[0].dim()
233 | else:
234 | w_embs = [dy.dropout(emb, self.emb_drop_rate) if self.emb_drop_rate > 0. and training else emb for emb in input_seqs]
235 | # if vocab_size = 0: input_seqs = [(input_dim, batch_size)]
236 |
237 | w_embs_r = w_embs[::-1]
238 | # birnn_outputs = [dy.dropout(emb, self.drop_out_rate) if self.drop_out_rate > 0. else emb for emb in self.birnn.transduce(w_embs)]
239 | fwd_vectors = self.fwd_RNN.initial_state().transduce(w_embs)
240 | bwd_vectors = self.bwd_RNN.initial_state().transduce(w_embs_r)[::-1]
241 |
242 | if char:
243 | return dy.concatenate([fwd_vectors[-1], bwd_vectors[0]])
244 |
245 | birnn_outputs = [dy.dropout(dy.concatenate([fwd_v, bwd_v]), self.drop_out_rate) if self.drop_out_rate > 0.0 and training
246 | else dy.concatenate([fwd_v, bwd_v])
247 | for (fwd_v, bwd_v) in zip(fwd_vectors, bwd_vectors)]
248 | return birnn_outputs
249 |
250 | def encode_word(self, input_seqs, training=True):
251 | # embedding dropout rate is 0.0, because we dropout at the later stage of RNN
252 | sents_embs = []
253 |
254 | for sent in input_seqs:
255 | sent_emb = []
256 | for w in sent:
257 | w_emb = self.encode_seq([w], training=training, char=True)
258 | sent_emb.append(w_emb)
259 | sents_embs.append(sent_emb)
260 | sents_embs, sents_mask = transpose_and_batch_embs(sents_embs, self.hidden_dim*2) # [(hidden_dim*2, batch_size)]
261 | return sents_embs
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | __author__ = 'chuntingzhou and aditichaudhary'
2 | import sys
3 | reload(sys)
4 | sys.setdefaultencoding('utf-8')
5 |
6 | def evaluate(data_loader, path, model, model_name,type="dev"):
7 | sents, char_sents, tgt_tags, discrete_features, bc_feats,_ = data_loader.get_data_set(path, args.lang, source="dev")
8 |
9 | prefix = model_name + "_" + str(uid)
10 | # tot_acc = 0.0
11 | predictions = []
12 | gold_standards = []
13 | sentences = []
14 | i = 0
15 | sentence_gold = {}
16 |
17 | score_sent = {}
18 | for sent, char_sent, tgt_tag, discrete_feature, bc_feat in zip(sents, char_sents, tgt_tags, discrete_features, bc_feats):
19 | dy.renew_cg()
20 | sent, char_sent, discrete_feature, bc_feat = [sent], [char_sent], [discrete_feature], [bc_feat]
21 | best_score, best_path = model.eval(sent, char_sent, discrete_feature, bc_feat, training=False,type=type)
22 |
23 | assert len(best_path) == len(tgt_tag)
24 | # acc = model.crf_decoder.cal_accuracy(best_path, tgt_tag)
25 | # tot_acc += acc
26 | predictions.append(best_path)
27 | gold_standards.append(tgt_tag)
28 |
29 | sentences.append(sent)
30 | sent_key = " ".join([str(x) for x in sent[0]])
31 | sentence_gold[sent_key] = tgt_tag
32 | score_sent[sent_key] = best_score
33 |
34 | i += 1
35 | if i % 1000 == 0:
36 | print("Testing processed %d lines " % i)
37 |
38 | pred_output_fname = "%s/%s_pred_output.txt" % (args.eval_folder,prefix)
39 | eval_output_fname = "%s_eval_score.txt" % (prefix)
40 | with open(pred_output_fname, "w") as fout:
41 | for sent, pred, gold in zip(sentences, predictions, gold_standards):
42 | for s, p, g in zip(sent[0], pred, gold):
43 | fout.write(data_loader.id_to_word[int(s)] + " " + data_loader.id_to_tag[g] + " " + data_loader.id_to_tag[p] + "\n")
44 | fout.write("\n")
45 |
46 | os.system("%s/conlleval.v2 < %s > %s" % (args.eval_folder,pred_output_fname, eval_output_fname))
47 |
48 | with open(eval_output_fname, "r") as fin:
49 | lid = 0
50 | for line in fin:
51 | if lid == 1:
52 | fields = line.split(";")
53 | acc = float(fields[0].split(":")[1].strip()[:-1])
54 | precision = float(fields[1].split(":")[1].strip()[:-1])
55 | recall = float(fields[2].split(":")[1].strip()[:-1])
56 | f1 = float(fields[3].split(":")[1].strip())
57 | lid += 1
58 |
59 | output = open(eval_output_fname, "r").read().strip()
60 | print(output)
61 | if type == "dev":
62 | os.system("rm %s" % (eval_output_fname,))
63 | os.system("rm %s" % (pred_output_fname,))
64 |
65 |
66 | return acc, precision, recall, f1,sentence_gold, score_sent
67 |
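# Note on the parsing above: it assumes the second line of conlleval output, which
# typically looks like (numbers purely illustrative)
#     accuracy:  97.81%; precision:  85.18%; recall:  82.45%; FB1:  83.79
# i.e. four ";"-separated "name: value" fields, with a trailing "%" on all but FB1.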
68 |
69 | def replace_singletons(data_loader, sents, replace_rate):
70 | new_batch_sents = []
71 | for sent in sents:
72 | new_sent = []
73 | for word in sent:
74 | if word in data_loader.singleton_words:
75 | new_sent.append(word if np.random.uniform(0., 1.) > replace_rate else data_loader.word_to_id[""])
76 | else:
77 | new_sent.append(word)
78 | new_batch_sents.append(new_sent)
79 | return new_batch_sents
80 |
81 |
82 | def main(args):
83 | prefix = args.model_name + "_" + str(uid)
84 | print("PREFIX: %s" % prefix)
85 | final_darpa_output_fname = "%s/%s_output.conll" % (args.eval_folder,prefix)
86 | best_output_fname = "%s/best_%s_output.conll" % (args.eval_folder,prefix)
87 | ner_data_loader = NER_DataLoader(args)
88 | print(ner_data_loader.id_to_tag)
89 |
90 | #Loading training data from CoNLL format
91 | if not args.data_aug:
92 | sents, char_sents, tgt_tags, discrete_features, bc_features,known_tags = ner_data_loader.get_data_set(args.train_path, args.lang)
93 | else:
94 | sents_tgt, char_sents_tgt, tags_tgt, dfs_tgt, bc_feats_tgt,known_tags_tgt = ner_data_loader.get_data_set(args.tgt_lang_train_path, args.lang)
95 | sents_aug, char_sents_aug, tags_aug, dfs_aug, bc_feats_aug, known_tags_aug= ner_data_loader.get_data_set(args.aug_lang_train_path, args.aug_lang)
96 | sents, char_sents, tgt_tags, discrete_features, bc_features,known_tags = sents_tgt+sents_aug, char_sents_tgt+char_sents_aug, tags_tgt+tags_aug, dfs_tgt+dfs_aug, bc_feats_tgt+bc_feats_aug,known_tags_tgt+known_tags_aug
97 |
98 |
99 | print("Data set size (train): %d" % len(sents))
100 | print("Number of discrete features: ", ner_data_loader.num_feats)
101 | epoch = bad_counter = updates = tot_example = cum_loss = 0
102 | patience = args.patience
103 |
104 | display_freq = 100
105 | valid_freq = args.valid_freq
106 | batch_size = args.batch_size
107 |
108 |
109 | print("Using Char CNN model!")
110 | model = vanilla_NER_CRF_model(args, ner_data_loader)
111 | inital_lr = args.init_lr
112 |
113 | if args.fineTune:
114 | print("Loading pre-trained model!")
115 | model.load()
116 |
117 | if len(sents) < 100:
118 | inital_lr = 0.0001
119 | else:
120 | inital_lr = args.init_lr #+ inital_lr * len(sents) / 1500.0
121 |
122 |
123 | trainer = dy.MomentumSGDTrainer(model.model, inital_lr, 0.9)
124 |
125 | def _check_batch_token(batch, id_to_vocab):
126 | for line in batch:
127 | print([id_to_vocab[i] for i in line])
128 |
129 | def _check_batch_char(batch, id_to_vocab):
130 | for line in batch:
131 | print([u" ".join([id_to_vocab[c] for c in w]) for w in line])
132 |
133 | lr_decay = args.decay_rate
134 |
135 | # decay_patience = 3
136 | # decay_num = 0
137 | valid_history = []
138 | best_results = [0.0, 0.0, 0.0, 0.0]
139 | while epoch <= args.tot_epochs:
140 | batches = make_bucket_batches(
141 | zip(sents, char_sents, tgt_tags, discrete_features, bc_features, known_tags), batch_size)
142 |
143 | for b_sents, b_char_sents, b_ner_tags, b_feats, b_bc_feats, b_known_tags in batches:
144 | dy.renew_cg()
145 |
146 | if args.replace_unk_rate > 0.0:
147 | b_sents = replace_singletons(ner_data_loader, b_sents, args.replace_unk_rate)
148 | # _check_batch_token(b_sents, ner_data_loader.id_to_word)
149 | # _check_batch_token(b_ner_tags, ner_data_loader.id_to_tag)
150 | # _check_batch_char(b_char_sents, ner_data_loader.id_to_char)
151 |
152 | loss = model.cal_loss(b_sents, b_char_sents, b_ner_tags, b_feats, b_bc_feats, b_known_tags, training=True)
153 | loss_val = loss.value()
154 | cum_loss += loss_val * len(b_sents)
155 | tot_example += len(b_sents)
156 |
157 | updates += 1
158 | loss.backward()
159 | trainer.update()
160 |
161 | if updates % display_freq == 0:
162 | print("Epoch = %d, Updates = %d, CRF Loss=%f, Accumulative Loss=%f." % (epoch, updates, loss_val, cum_loss*1.0/tot_example))
163 | if updates % valid_freq == 0:
164 | acc, precision, recall, f1,_,_ = evaluate(ner_data_loader, args.dev_path, model, args.model_name)
165 |
166 | if len(valid_history) == 0 or f1 > max(valid_history):
167 | bad_counter = 0
168 | best_results = [acc, precision, recall, f1]
169 | if updates > 0:
170 | print("Saving the best model so far.......")
171 | model.save()
172 | else:
173 | bad_counter += 1
174 | if args.lr_decay and bad_counter >= 3 and os.path.exists(args.save_to_path):
175 | bad_counter = 0
176 | model.load()
177 | lr = inital_lr / (1 + epoch * lr_decay)
178 | print("Epoch = %d, Learning Rate = %f." % (epoch, lr))
179 | trainer = dy.MomentumSGDTrainer(model.model, lr)
180 |
181 | if bad_counter > patience:
182 | print("Early stop!")
183 | print("Best on validation: acc=%f, prec=%f, recall=%f, f1=%f" % tuple(best_results))
184 |
185 | acc, precision, recall, f1,sentence_gold, score_sent = evaluate(ner_data_loader, args.test_path, model, args.model_name,"test")
186 | if args.SPAN_wise:
187 | createAnnotationOutput_SPAN_wise(args, model, ner_data_loader, sentence_gold, score_sent)
188 |
189 | exit(0)
190 | valid_history.append(f1)
191 | epoch += 1
192 |
193 |
194 |
195 | _,_,_,_,sentence_gold, score_sent = evaluate(ner_data_loader, args.test_path, model, args.model_name,"test")
196 | if args.SPAN_wise:
197 | createAnnotationOutput_SPAN_wise(args, model, ner_data_loader, sentence_gold, score_sent)
198 | print("All Epochs done.")
199 |
200 | def createAnnotationOutput_SPAN_wise(args, model, data_loader, sentence_gold, score_sent):
201 | # normalize all the entropy_spans ONLY DONE for the CFB
202 |
203 |
204 | reverse = True #For ETAL we look at the highest entropy ones, hence sorting is reversed
205 | if args.use_CFB: #For CFEAL we look at the least confident, hence sorting is not reversed
206 | reverse = False
207 |
208 |
209 | # Order the sentences by entropy of the spans
210 | fout= codecs.open(args.to_annotate, "w", encoding='utf-8')
211 |
212 | sorted_spans = sorted(model.crf_decoder.most_uncertain_entropy_spans, key=lambda k:model.crf_decoder.most_uncertain_entropy_spans[k],reverse=reverse)
213 | print("Total unique spans: {0}".format(len(sorted_spans)))
214 | count_span = args.k
215 | count_tokens = args.k
216 |
217 | #DEBUG Print Span Entropy in the sorted order of selected spans
218 | fdebug = codecs.open("./" + args.model_name + "_span_entropy_debug.txt", "w", encoding='utf-8')
219 |
220 | for sorted_span in sorted_spans:
221 |
222 | span_words= []
223 | if count_tokens <=0:
224 | break
225 | (span_entropy,sentence_key, start, end,best_path) = model.crf_decoder.most_uncertain_entropy_spans[sorted_span]
226 | gold_path = sentence_gold[sentence_key]
227 | sent = sentence_key.split()
228 |
229 | for t in sorted_span.split():
230 | span_words.append(data_loader.id_to_word[int(t)])
231 | fdebug.write(" ".join(span_words) + " " + str(span_entropy) + "\n")
232 |
233 | first = True
234 | path = deepcopy(best_path)
235 | for i in range(start, end):
236 | if first:
237 | path[i] = -10 #Id for B-UNK
238 | first = False
239 | else:
240 | path[i] = -11 #Id for I-UNK
241 |
242 | idx = 0
243 | for token, tag in zip(sent, path):
244 |
245 | if tag == -10:
246 | tag_label = "B-UNK"
247 | count_span -= 1
248 | count_tokens -= 1
249 | elif tag == -11:
250 | tag_label = "I-UNK"
251 | count_tokens -= 1
252 | else:
253 | tag_label = data_loader.id_to_tag[tag]
254 |
255 | gold_tag_label = data_loader.id_to_tag[gold_path[idx]]
256 | idx += 1
257 | fout.write(data_loader.id_to_word[int(token)] + "\t" + tag_label + "\t" + gold_tag_label + "\n")
258 |
259 | fout.write("\n")
260 |
261 | print("Total unique spans for exercise: {0}".format(args.k))
262 |
263 | #SAL: Select most uncertain sequence
264 | basename = os.path.basename(args.to_annotate).replace(".conll", "")
265 | LC_output_file = os.path.dirname(args.to_annotate) + "/" + basename + "_LC.conll"
266 | count_tokens = args.k
267 | with codecs.open(LC_output_file, "w", encoding='utf-8') as fout:
268 | idx = 0
269 |         for sentence_key in sorted(score_sent, key=score_sent.get):  # lowest (least confident) model score first
270 | if count_tokens<=0:
271 | break
272 | sent = sentence_key.split()
273 | gold_path = sentence_gold[sentence_key]
274 | token_count = 0
275 | for token in sent:
276 | count_tokens -= 1
277 | gold_tag_label = data_loader.id_to_tag[gold_path[token_count]]
278 | token_count += 1
279 | fout.write(data_loader.id_to_word[int(token)] + "\t" + "UNK " + gold_tag_label + "\n")
280 | fout.write("\n")
281 | idx += 1
282 |
283 |
284 | def test_single_model(args):
285 | ner_data_loader = NER_DataLoader(args)
286 | # ugly: get discrete number features
287 | _, _, _, _, _,_ = ner_data_loader.get_data_set(args.train_path, args.lang)
288 |
289 | print("Using Char CNN model!")
290 | model = vanilla_NER_CRF_model(args, ner_data_loader)
291 | model.load()
292 |
293 | _,_,_,_,sentence_gold, score_sent = evaluate(ner_data_loader, args.test_path, model, args.model_name,"test")
294 | if args.SPAN_wise:
295 | createAnnotationOutput_SPAN_wise(args, model, ner_data_loader, sentence_gold, score_sent)
296 |
297 |
298 |
299 |
300 | from args import init_config
301 |
302 | args = init_config()
303 | from models.model_builder import *
304 | import os
305 | import uuid
306 | from dataloaders.data_loader import *
307 | uid = uuid.uuid4().get_hex()[:6]
308 |
309 | if __name__ == "__main__":
310 | # args = init_config()
311 | if args.mode == "train":
312 |         if args.load_from_path is None:
313 |             # fall back to the save path when no explicit load path is given
314 |             args.load_from_path = args.save_to_path
315 |
316 | main(args)
317 |
318 | elif args.mode == "test_1":
319 | test_single_model(args)
320 |
321 | else:
322 | raise NotImplementedError
323 |
--------------------------------------------------------------------------------
/eval/conlleval.v2:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | # conlleval: evaluate result of processing CoNLL-2000 shared task
3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file
4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html
5 | # options: l: generate LaTeX output for tables like in
6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex
7 | # r: accept raw result tags (without B- and I- prefix;
8 | # assumes one word per chunk)
9 | # d: alternative delimiter tag (default is single space)
10 | # o: alternative outside tag (default is O)
11 | # note: the file should contain lines with items separated
12 | # by $delimiter characters (default space). The final
13 | # two items should contain the correct tag and the
14 | # guessed tag in that order. Sentences should be
15 | # separated from each other by empty lines or lines
16 | # with $boundary fields (default -X-).
17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/
18 | # started: 1998-09-25
19 | # version: 2004-01-26
20 | # author: Erik Tjong Kim Sang
21 |
22 | use strict;
23 |
24 | my $false = 0;
25 | my $true = 42;
26 |
27 | my $boundary = "-X-"; # sentence boundary
28 | my $correct; # current corpus chunk tag (I,O,B)
29 | my $correctChunk = 0; # number of correctly identified chunks
30 | my $correctTags = 0; # number of correct chunk tags
31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.)
32 | my $delimiter = " "; # field delimiter
33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979)
34 | my $firstItem; # first feature (for sentence boundary checks)
35 | my $foundCorrect = 0; # number of chunks in corpus
36 | my $foundGuessed = 0; # number of identified chunks
37 | my $guessed; # current guessed chunk tag
38 | my $guessedType; # type of current guessed chunk tag
39 | my $i; # miscellaneous counter
40 | my $inCorrect = $false; # currently processed chunk is correct until now
41 | my $lastCorrect = "O"; # previous chunk tag in corpus
42 | my $latex = 0; # generate LaTeX formatted output
43 | my $lastCorrectType = ""; # type of previously identified chunk tag
44 | my $lastGuessed = "O"; # previously identified chunk tag
45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus
46 | my $lastType; # temporary storage for detecting duplicates
47 | my $line; # line
48 | my $nbrOfFeatures = -1; # number of features per line
49 | my $precision = 0.0; # precision score
50 | my $oTag = "O"; # outside tag, default O
51 | my $raw = 0; # raw input: add B to every token
52 | my $recall = 0.0; # recall score
53 | my $tokenCounter = 0; # token counter (ignores sentence breaks)
54 |
55 | my %correctChunk = (); # number of correctly identified chunks per type
56 | my %foundCorrect = (); # number of chunks in corpus per type
57 | my %foundGuessed = (); # number of identified chunks per type
58 |
59 | my @features; # features on line
60 | my @sortedTypes; # sorted list of chunk type names
61 |
62 | # sanity check
63 | while (@ARGV and $ARGV[0] =~ /^-/) {
64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); }
65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); }
66 | elsif ($ARGV[0] eq "-d") {
67 | shift(@ARGV);
68 | if (not defined $ARGV[0]) {
69 | die "conlleval: -d requires delimiter character";
70 | }
71 | $delimiter = shift(@ARGV);
72 | } elsif ($ARGV[0] eq "-o") {
73 | shift(@ARGV);
74 | if (not defined $ARGV[0]) {
75 | die "conlleval: -o requires delimiter character";
76 | }
77 | $oTag = shift(@ARGV);
78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; }
79 | }
80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; }
81 | # process input
82 | while (<STDIN>) {
83 | chomp($line = $_);
84 | @features = split(/$delimiter/,$line);
85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; }
86 | elsif ($nbrOfFeatures != $#features and @features != 0) {
87 | printf STDERR "unexpected number of features: %d (%d)\n",
88 | $#features+1,$nbrOfFeatures+1;
89 | exit(1);
90 | }
91 | if (@features == 0 or
92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); }
93 | if (@features < 2) {
94 | die "conlleval: unexpected number of features in line $line\n";
95 | }
96 | if ($raw) {
97 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; }
98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; }
99 | if ($features[$#features] ne "O") {
100 | $features[$#features] = "B-$features[$#features]";
101 | }
102 | if ($features[$#features-1] ne "O") {
103 | $features[$#features-1] = "B-$features[$#features-1]";
104 | }
105 | }
106 | # 20040126 ET code which allows hyphens in the types
107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
108 | $guessed = $1;
109 | $guessedType = $2;
110 | } else {
111 | $guessed = $features[$#features];
112 | $guessedType = "";
113 | }
114 | pop(@features);
115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
116 | $correct = $1;
117 | $correctType = $2;
118 | } else {
119 | $correct = $features[$#features];
120 | $correctType = "";
121 | }
122 | pop(@features);
123 | # ($guessed,$guessedType) = split(/-/,pop(@features));
124 | # ($correct,$correctType) = split(/-/,pop(@features));
125 | $guessedType = $guessedType ? $guessedType : "";
126 | $correctType = $correctType ? $correctType : "";
127 | $firstItem = shift(@features);
128 |
129 | # 1999-06-26 sentence breaks should always be counted as out of chunk
130 | if ( $firstItem eq $boundary ) { $guessed = "O"; }
131 |
132 | if ($inCorrect) {
133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and
134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and
135 | $lastGuessedType eq $lastCorrectType) {
136 | $inCorrect=$false;
137 | $correctChunk++;
138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ?
139 | $correctChunk{$lastCorrectType}+1 : 1;
140 | } elsif (
141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) !=
142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or
143 | $guessedType ne $correctType ) {
144 | $inCorrect=$false;
145 | }
146 | }
147 |
148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and
149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and
150 | $guessedType eq $correctType) { $inCorrect = $true; }
151 |
152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) {
153 | $foundCorrect++;
154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ?
155 | $foundCorrect{$correctType}+1 : 1;
156 | }
157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) {
158 | $foundGuessed++;
159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ?
160 | $foundGuessed{$guessedType}+1 : 1;
161 | }
162 | if ( $firstItem ne $boundary ) {
163 | if ( $correct eq $guessed and $guessedType eq $correctType ) {
164 | $correctTags++;
165 | }
166 | $tokenCounter++;
167 | }
168 |
169 | $lastGuessed = $guessed;
170 | $lastCorrect = $correct;
171 | $lastGuessedType = $guessedType;
172 | $lastCorrectType = $correctType;
173 | }
174 | if ($inCorrect) {
175 | $correctChunk++;
176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ?
177 | $correctChunk{$lastCorrectType}+1 : 1;
178 | }
179 |
180 | if (not $latex) {
181 | # compute overall precision, recall and FB1 (default values are 0.0)
182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0);
183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0);
184 | $FB1 = 2*$precision*$recall/($precision+$recall)
185 | if ($precision+$recall > 0);
186 |
187 | # print overall performance
188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; ";
189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n";
190 | if ($tokenCounter>0) {
191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter;
192 | printf "precision: %6.2f%%; ",$precision;
193 | printf "recall: %6.2f%%; ",$recall;
194 | printf "FB1: %6.2f\n",$FB1;
195 | }
196 | }
197 |
198 | # sort chunk type names
199 | undef($lastType);
200 | @sortedTypes = ();
201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) {
202 | if (not($lastType) or $lastType ne $i) {
203 | push(@sortedTypes,($i));
204 | }
205 | $lastType = $i;
206 | }
207 | # print performance per chunk type
208 | if (not $latex) {
209 | for $i (@sortedTypes) {
210 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; }
212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; }
213 | if (not($foundCorrect{$i})) { $recall = 0.0; }
214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; }
215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; }
216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); }
217 | printf "%17s: ",$i;
218 | printf "precision: %6.2f%%; ",$precision;
219 | printf "recall: %6.2f%%; ",$recall;
220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i};
221 | }
222 | } else {
223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline";
224 | for $i (@sortedTypes) {
225 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
226 | if (not($foundGuessed{$i})) { $precision = 0.0; }
227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; }
228 | if (not($foundCorrect{$i})) { $recall = 0.0; }
229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; }
230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; }
231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); }
232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\",
233 | $i,$precision,$recall,$FB1;
234 | }
235 | print "\\hline\n";
236 | $precision = 0.0;
237 | $recall = 0;
238 | $FB1 = 0.0;
239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0);
240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0);
241 | $FB1 = 2*$precision*$recall/($precision+$recall)
242 | if ($precision+$recall > 0);
243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n",
244 | $precision,$recall,$FB1;
245 | }
246 |
247 | exit 0;
248 |
249 | # endOfChunk: checks if a chunk ended between the previous and current word
250 | # arguments: previous and current chunk tags, previous and current types
251 | # note: this code is capable of handling other chunk representations
252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
254 |
255 | sub endOfChunk {
256 | my $prevTag = shift(@_);
257 | my $tag = shift(@_);
258 | my $prevType = shift(@_);
259 | my $type = shift(@_);
260 | my $chunkEnd = $false;
261 |
262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; }
263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; }
264 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkEnd = $true; }
265 |
266 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; }
267 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkEnd = $true; }
268 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; }
269 |
270 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; }
271 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; }
272 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; }
273 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkEnd = $true; }
274 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkEnd = $true; }
275 |
276 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkEnd = $true; }
277 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkEnd = $true; }
278 | if ( $prevTag eq "S" and $tag eq "O" ) { $chunkEnd = $true; }
279 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkEnd = $true; }
280 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkEnd = $true; }
281 |
282 |
283 | if ($prevTag ne "O" and $prevTag ne "." and $prevType ne $type) {
284 | $chunkEnd = $true;
285 | }
286 |
287 | # corrected 1998-12-22: these chunks are assumed to have length 1
288 | if ( $prevTag eq "]" ) { $chunkEnd = $true; }
289 | if ( $prevTag eq "[" ) { $chunkEnd = $true; }
290 |
291 | return($chunkEnd);
292 | }
293 |
294 | # startOfChunk: checks if a chunk started between the previous and current word
295 | # arguments: previous and current chunk tags, previous and current types
296 | # note: this code is capable of handling other chunk representations
297 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
298 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
299 |
300 | sub startOfChunk {
301 | my $prevTag = shift(@_);
302 | my $tag = shift(@_);
303 | my $prevType = shift(@_);
304 | my $type = shift(@_);
305 | my $chunkStart = $false;
306 |
307 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; }
308 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; }
309 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; }
310 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkStart = $true; }
311 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkStart = $true; }
312 |
313 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkStart = $true; }
314 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkStart = $true; }
315 | if ( $prevTag eq "O" and $tag eq "S" ) { $chunkStart = $true; }
316 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkStart = $true; }
317 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkStart = $true; }
318 |
319 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; }
320 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkStart = $true; }
321 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; }
322 |
323 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkStart = $true; }
324 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; }
325 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; }
326 |
327 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) {
328 | $chunkStart = $true;
329 | }
330 |
331 | # corrected 1998-12-22: these chunks are assumed to have length 1
332 | if ( $tag eq "[" ) { $chunkStart = $true; }
333 | if ( $tag eq "]" ) { $chunkStart = $true; }
334 |
335 | return($chunkStart);
336 | }
337 |
--------------------------------------------------------------------------------
/models/decoders.py:
--------------------------------------------------------------------------------
1 | __author__ = 'chuntingzhou and aditichaudhary'
2 | from utils.util import *
3 | import numpy as np
4 | from collections import defaultdict
5 | from scipy.special import logsumexp
6 |
7 |
8 | class Decoder():
9 | def __init__(self, tag_size):
10 | # type: () -> object
11 | pass
12 |
13 | def decode_loss(self):
14 | raise NotImplementedError
15 |
16 | def decoding(self):
17 | raise NotImplementedError
18 |
19 |
20 | def constrained_transition_init(transition_matrix, constraints):
21 | '''
22 | :param transition_matrix: numpy array, (from, to)
23 |     :param constraints: [[from_indexes], [to_indexes]]
24 | :return: newly initialized transition matrix
25 | '''
26 |     for cons in constraints:
27 | transition_matrix[cons[0], cons[1]] = -1000.0
28 | return transition_matrix
29 |
30 | class chain_CRF_decoder(Decoder):
31 | ''' For NER and POS Tagging. '''
32 |
33 | def __init__(self, args, model, src_output_dim, tag_emb_dim, tag_size, constraints=None):
34 | Decoder.__init__(self, tag_size)
35 | self.model = model
36 | self.start_id = tag_size
37 | self.end_id = tag_size + 1
38 | self.tag_size = tag_size + 2
39 | tag_size = tag_size + 2
40 | self.args = args
41 |
42 | # optional: transform the hidden space of src encodings into the tag embedding space
43 | self.W_src2tag_readout = model.add_parameters((tag_emb_dim, src_output_dim))
44 | self.b_src2tag_readout = model.add_parameters((tag_emb_dim))
45 | self.b_src2tag_readout.zero()
46 |
47 | self.W_scores_readout2tag = model.add_parameters((tag_size, tag_emb_dim))
48 | self.b_scores_readout2tag = model.add_parameters((tag_size))
49 | self.b_scores_readout2tag.zero()
50 |
51 | # (to, from), trans[i] is the transition score to i
52 | init_transition_matrix = np.random.randn(tag_size, tag_size) # from, to
53 | init_transition_matrix[:, self.end_id] = -1000.0
54 | init_transition_matrix[self.start_id, :] = -1000.0
55 | if False and constraints is not None:
56 | init_transition_matrix = constrained_transition_init(init_transition_matrix, constraints)
57 | # print init_transition_matrix
58 | #self.transition_matrix = model.add_lookup_parameters((tag_size, tag_size),
59 | # init=dy.NumpyInitializer(init_transition_matrix))
60 | self.transition_matrix = model.lookup_parameters_from_numpy(init_transition_matrix) # (to, from)
61 |
62 | self.ngram = args.ngram
63 |
64 | self.entropy_threshold = args.entropy_threshold
65 | if args.entropy_threshold is not None and args.use_CFB:
66 | self.entropy_threshold = args.entropy_threshold * -1
67 |
68 | self.prob_threshold = np.NINF
69 | self.entropy_spans = defaultdict(lambda: 0)
70 | self.most_uncertain_entropy_spans = {}
71 | self.entropy_spans_number = defaultdict(lambda: 0)
72 | self.full_sentences = defaultdict(list)
73 | self.avg_spans_in_sent_entropy = defaultdict(list)
74 | self.SPAN_wise = args.SPAN_wise
75 |
76 | def forward_alg(self, tag_scores):
77 | ''' Forward DP for CRF.
78 | tag_scores (list of batched dy.Tensor): (tag_size, batchsize)
79 | '''
80 | # Be aware: if a is lookup_parameter with 2 dimension, then a[i] returns one row;
81 | # if b = dy.parameter(a), then b[i] returns one column; which means dy.parameter(a) already transpose a
82 | # transpose_transition_score = self.transition_matrix
83 | transpose_transition_score = dy.parameter(self.transition_matrix) # (from, to)
84 |
85 | # alpha(t', s) = the score of sequence from t=0 to t=t' in log space
86 | # np_init_alphas = -100.0 * np.ones((self.tag_size, batch_size))
87 | # np_init_alphas[self.start_id, :] = 0.0
88 | # alpha_tm1 = dy.inputTensor(np_init_alphas, batched=True)
89 | alphas = []
90 |
91 | alpha_tm1 = transpose_transition_score[self.start_id] + tag_scores[0]
92 | # self.transition_matrix[i]: from i, column
93 | # transpose_score[i]: to i, row
94 | # transpose_score: to, from
95 | alphas.append(alpha_tm1)
96 |
97 | for tag_score in tag_scores[1:]:
98 | # extend for each transit
99 | alpha_tm1 = dy.concatenate_cols([alpha_tm1] * self.tag_size) # (from, to, batch_size)
100 | # each column i of tag_score will be the repeated emission score to tag i
101 | tag_score = dy.transpose(dy.concatenate_cols([tag_score] * self.tag_size))
102 | alpha_t = alpha_tm1 + transpose_transition_score + tag_score
103 | alpha_tm1 = log_sum_exp_dim_0(alpha_t) # (tag_size, batch_size)
104 | alphas.append(alpha_tm1)
105 |
106 | terminal_alpha = log_sum_exp_dim_0(alpha_tm1 + self.transition_matrix[self.end_id]) # (1, batch_size)
107 | return terminal_alpha,alphas
108 |
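    # Note: forward_alg above is the standard CRF forward pass in log space,
    #     alpha_t(y) = score_t(y) + logsumexp_{y'} [ alpha_{t-1}(y') + trans(y' -> y) ],
    # and terminal_alpha is log Z, the log partition function over all tag sequences.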
109 | def score_one_sequence(self, tag_scores, tags, batch_size):
110 | ''' tags: list of tag ids at each time step '''
111 | # print tags, batch_size
112 | # print batch_size
113 | # print "scoring one sentence"
114 | tags = [[self.start_id] * batch_size] + tags # len(tag_scores) = len(tags) - 1
115 | score = dy.inputTensor(np.zeros(batch_size), batched=True)
116 | # tag_scores = dy.concatenate_cols(tag_scores) # tot_tags, sent_len, batch_size
117 | # print "tag dim: ", tag_scores.dim()
118 | for i in range(len(tags) - 1):
119 | score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, tags[i + 1]), tags[i]) \
120 | + dy.pick_batch(tag_scores[i], tags[i + 1])
121 | score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, [self.end_id]*batch_size), tags[-1])
122 | return score
123 |
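    # Note: score_one_sequence returns the unnormalized log score of one tag path,
    #     s(y) = sum_t [ trans(y_{t-1} -> y_t) + score_t(y_t) ] + trans(y_T -> END),  with y_0 = START,
    # which decode_loss subtracts from the forward score (log Z) to form the CRF loss.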
124 | def backward_one_sequence(self, tag_scores):
125 | ''' Backward DP for CRF.
126 | tag_scores (list of batched dy.Tensor): (tag_size, batchsize)
127 | '''
128 | # Be aware: if a is lookup_parameter with 2 dimension, then a[i] returns one row;
129 | # if b = dy.parameter(a), then b[i] returns one column; which means dy.parameter(a) already transpose a
130 | transpose_transition_score = dy.parameter(self.transition_matrix)
131 | # transpose_transition_score = dy.parameter(self.transition_matrix)
132 |
133 | # alpha(t', s) = the score of sequence from t=0 to t=t' in log space
134 | # np_init_alphas = -100.0 * np.ones((self.tag_size, batch_size))
135 | # np_init_alphas[self.start_id, :] = 0.0
136 | # alpha_tm1 = dy.inputTensor(np_init_alphas, batched=True)
137 | betas = []
138 | # beta_tp1 = self.transition_matrix[self.end_id] + tag_scores[-1]
139 | # beta_tp1 = dy.inputTensor(np.zeros(self.tag_size))
140 | beta_tp1 = self.transition_matrix[self.end_id]
141 | betas.append(beta_tp1)
142 | # self.transition_matrix[i]: from i, column
143 | # transpose_score[i]: to i, row
144 | # transpose_score: to, from
145 | seq_len = len(tag_scores)
146 | tag_scores.reverse()
147 | for tag_score in tag_scores[0:seq_len - 1]:
148 | # extend for each transit
149 | beta_tp1 = dy.concatenate_cols([beta_tp1] * self.tag_size) # (to, from, batch_size)
150 | # each column i of tag_score will be the repeated emission score to tag i
151 | tag_score = dy.concatenate_cols([tag_score] * self.tag_size) # (to, from)
152 | beta_t = beta_tp1 + dy.transpose(transpose_transition_score) + tag_score # (to, from)
153 | beta_tp1 = log_sum_exp_dim_0(beta_t) # (tag_size, batch_size)
154 | betas.append(beta_tp1)
155 |
156 | # betas.append(beta_tp1 + transpose_transition_score[self.start_id] + tag_scores[-1])
157 | terminal_beta = log_sum_exp_dim_0(
158 | beta_tp1 + transpose_transition_score[self.start_id] + tag_scores[-1]) # (1, batch_size)
159 | betas.reverse()
160 | return terminal_beta, betas
161 |
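    # Note: together with forward_alg, these quantities give per-position marginals; the
    # gammas consumed by get_uncertain_subsequences below are expected to be
    #     gamma_t(y) = alpha_t(y) + beta_t(y) - log Z  ~  log P(y_t = y | x),
    # computed by the caller from the alphas/betas returned here.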
162 | def get_uncertain_subsequences(self, sents, tag_scores, alphas, betas, Z, gammas,
163 | best_path, B_tags, I_tags, O_tags):
164 | # predicted_path = deepcopy(best_path)
165 | # transition_B_O = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, O_tags), B_tags).value())
166 | # transition_I_O = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, O_tags), I_tags).value())
167 | # transition_B_I = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, I_tags), B_tags).value())
168 | # transition_I_I = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, I_tags), I_tags).value())
169 |
170 |
171 | first = True
172 | Z = Z.value()
173 | for i in range(len(sents)):
174 | # log_p_alpha = np.array(alphas[i].value())[B_tags]
175 | # transition = transition_B_O
176 |         log_pin = logsumexp(np.array(gammas[i].value())[B_tags])  # log P(an entity starts at i | x) = log sum over B-* tags of exp(log P(y_i = B-tag | x))
177 |
178 | for j in range(i + 1, len(sents)):
179 | if (j - i) > self.ngram:
180 | break
181 |
182 | log_p_out = np.array(gammas[j].value())[O_tags]
183 | log_p = log_pin + log_p_out
184 | p = np.exp(log_p)
185 | if p > 1.0:
186 | #print(p, log_p)
187 | H= 0.0
188 | else:
189 | H = -(p * log_p) - ((1-p)* np.log(1-p))
190 |
191 | if H > self.entropy_threshold:
192 | # best_path = deepcopy(predicted_path)
193 | span = " ".join([str(x) for x in sents[i:j]])
194 | sent = " ".join([str(x) for x in sents])
195 | self.entropy_spans[span] += H
196 | self.entropy_spans_number[span] += 1
197 |
198 | if self.SPAN_wise:
199 | if span in self.most_uncertain_entropy_spans:
200 | (existing_entropy, _,_,_,_) = self.most_uncertain_entropy_spans[span]
201 | #if H > existing_entropy:
202 | # self.most_uncertain_entropy_spans[span] = (H, sent, i,j,best_path)
203 | self.most_uncertain_entropy_spans[span] = (self.entropy_spans[span], sent, i, j, best_path)
204 | else:
205 | self.most_uncertain_entropy_spans[span] = (H, sent, i,j,best_path)
206 |
207 | # for k in range(i,j+1):
208 | # best_path[k] = -10
209 | self.full_sentences[sent].append((i, j, best_path, self.entropy_spans[span]))
210 | self.avg_spans_in_sent_entropy[sent].append(span)
211 | # self.full_sentences[span] = (sents,best_path,predicted_path, self.entropy_spans[span])
212 |
213 | log_pin += logsumexp(np.array(gammas[j].value())[I_tags])
214 | if log_pin < np.log(1e-4):
215 | break
216 |
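    # Note on the selection rule above: for a candidate span covering positions i..j-1,
    # p approximates the probability that an entity starts at i and position j lies
    # outside it, H is the binary entropy
    #     H = -p * log(p) - (1 - p) * log(1 - p),
    # and spans whose entropy exceeds entropy_threshold are recorded (repeat hits
    # accumulate in entropy_spans) as candidates for annotation (the ETAL setting).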
217 |
218 | def get_uncertain_subsequences_CFB(self, sents, tag_scores, alphas, betas, Z, gammas,
219 | best_path, tag_to_id,B_UNK, I_UNK):
220 | first = True
221 | Z = Z.value()
222 | entropy_spans_number = defaultdict(lambda :0)
223 | for i in range(len(sents)):
224 | known_tags =np.array([[0]] * len(sents)).reshape((len(sents),1,1))
225 | known_tags[i][0][0] = 1
226 | tags = np.array([B_UNK] * len(sents)).reshape((len(sents),1))
227 | tags[i][0] = best_path[i]
228 | for j in range(i + 1, len(sents)):
229 | if (j - i + 1) > self.ngram:
230 | break
231 | tags[j][0] = best_path[j]
232 | known_tags[j][0][0] = 1
233 | Z_span = self.score_one_sequence_partial(tag_scores, tags, 1, known_tags, tag_to_id, B_UNK, I_UNK)
234 | confidence = Z_span.value() - Z
235 |
236 | if confidence < self.entropy_threshold:
237 | # best_path = deepcopy(predicted_path)
238 | span = " ".join([str(x) for x in sents[i:j + 1]])
239 | sent = " ".join([str(x) for x in sents])
240 |
241 | if self.SPAN_wise:
242 | if span in self.most_uncertain_entropy_spans:
243 | (existing_threshold, _,_,_,_) = self.most_uncertain_entropy_spans[span]
244 | if confidence < existing_threshold:
245 | self.most_uncertain_entropy_spans[span] = (confidence, sent, i, j,best_path)
246 | else:
247 | self.most_uncertain_entropy_spans[span] = (confidence, sent, i, j,best_path)
248 |
249 | self.entropy_spans[span] += confidence
250 | self.entropy_spans_number[span] += 1
251 | #self.full_sentences[sent].append((i, j, best_path, self.entropy_spans[span]))
252 | self.full_sentences[sent].append((i, j, best_path, span))
253 | #self.avg_spans_in_sent_entropy[sent].append(self.entropy_spans[span])
254 | self.avg_spans_in_sent_entropy[sent].append(span)
255 |
256 |
257 | def decode_loss(self, src_encodings, tgt_tags, use_partial, known_tags, tag_to_id, B_UNK, I_UNK):
258 | # This is the batched version which requires bucketed batch input with the same length.
259 | '''
260 | The length of src_encodings and tgt_tags are time_steps.
261 | src_encodings: list of dynet.Tensor (src_output_dim, batch_size)
262 | tgt_tags: list of tag ids [(1, batch_size)]
263 | return: average of negative log likelihood
264 | '''
265 | # TODO: transpose tgt tags first
266 | batch_size = len(tgt_tags)
267 | tgt_tags, tgt_mask = transpose_input(tgt_tags, 0)
268 | known_tags, _ = transpose_input(known_tags, 0)
269 |
270 | W_src2tag_readout = dy.parameter(self.W_src2tag_readout)
271 | b_src2tag_readout = dy.parameter(self.b_src2tag_readout)
272 | W_score_tag = dy.parameter(self.W_scores_readout2tag)
273 | b_score_tag = dy.parameter(self.b_scores_readout2tag)
274 |
275 | tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) for src_encoding
276 | in src_encodings]
277 |
278 | tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs]
279 |
280 | # scores over all paths, all scores are in log-space
281 | forward_scores,_ = self.forward_alg(tag_scores)
282 |
283 | if use_partial:
284 | gold_score = self.score_one_sequence_partial(tag_scores, tgt_tags, batch_size, known_tags, tag_to_id, B_UNK,
285 | I_UNK)
286 | else:
287 | gold_score = self.score_one_sequence(tag_scores, tgt_tags, batch_size)
288 |
289 | # negative log likelihood
290 | loss = dy.sum_batches(forward_scores - gold_score) / batch_size
291 | return loss #, dy.sum_batches(forward_scores)/batch_size, dy.sum_batches(gold_score) / batch_size
292 |
293 | def makeMask(self, batch_size, known_tags, tag_to_id, tags, index, B_UNK, I_UNK):
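    |         # Builds a (tag_size, batch_size) additive mask for time step `index`:
    |         # every entry starts at -1000 (effectively -inf in log space); positions
    |         # with an annotated tag keep only that tag, and positions tagged B_UNK keep
    |         # the whole `possible_labels` set below.  The I_UNK argument is not used here.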
294 | mask_w_0 = np.array([[-1000] * self.tag_size])
295 | mask_w_0 = np.transpose(mask_w_0)
296 | mask_w_0_all_s = np.reshape(np.array([mask_w_0] * batch_size), (self.tag_size, batch_size))
297 |
298 | mask_idx = []
299 | tag_vals = []
300 | for idx, w0_si in enumerate(known_tags[index]):
301 | if w0_si[0] == 1:
302 | mask_idx.append(idx)
303 | tag_vals.append(tags[index][idx])
304 | else:
305 | if tags[index][idx] == B_UNK:
306 | if self.args.misc:
307 | possible_labels = ["B-LOC", "B-PER", "B-ORG", "B-MISC", "O","I-LOC", "I-PER", "I-ORG", "I-MISC"]
308 | else:
309 | possible_labels = ["B-LOC", "B-PER", "B-ORG", "B-GPE", "O","I-LOC", "I-PER", "I-ORG", "I-GPE"]
310 | for pl in possible_labels:
311 | mask_idx.append(idx)
312 | tag_vals.append(tag_to_id[pl])
313 | mask_w_0_all_s[tag_vals, mask_idx] = 0
314 | return mask_w_0_all_s
315 |
316 | def score_one_sequence_partial(self, tag_scores, tags, batch_size, known_tags, tag_to_id, B_UNK, I_UNK):
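    |         # Constrained forward algorithm: the standard CRF forward recursion, except
    |         # that the additive mask from makeMask() is applied at every time step, so
    |         # only label sequences consistent with the partial annotation contribute to
    |         # the returned log partition score.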
317 | transpose_transition_score = dy.parameter(self.transition_matrix)
318 |
319 | alpha_tm1 = transpose_transition_score[self.start_id] + tag_scores[0]
320 |
321 | mask_w_0_all_s = self.makeMask(batch_size, known_tags, tag_to_id, tags, 0, B_UNK, I_UNK)
322 | i = 1
323 | alpha_tm1 = alpha_tm1 + dy.inputTensor(mask_w_0_all_s, batched=True)
324 | for tag_score in tag_scores[1:]:
325 | alpha_tm1 = dy.concatenate_cols([alpha_tm1] * self.tag_size) # (from, to, batch_size)
326 | tag_score = dy.transpose(dy.concatenate_cols([tag_score] * self.tag_size))
327 | alpha_t = alpha_tm1 + transpose_transition_score + tag_score
328 | alpha_tm1 = log_sum_exp_dim_0(alpha_t) # (tag_size, batch_size)
329 | mask_w_i_all_s = self.makeMask(batch_size, known_tags, tag_to_id, tags, i, B_UNK, I_UNK)
330 | alpha_tm1 = alpha_tm1 + dy.inputTensor(mask_w_i_all_s, batched=True)
331 | i = i + 1
332 |
333 | terminal_alpha = log_sum_exp_dim_0(alpha_tm1 + self.transition_matrix[self.end_id]) # (1, batch_size)
334 | return terminal_alpha
335 |
336 |
337 | def get_crf_scores(self, src_encodings):
338 | W_src2tag_readout = dy.parameter(self.W_src2tag_readout)
339 | b_src2tag_readout = dy.parameter(self.b_src2tag_readout)
340 | W_score_tag = dy.parameter(self.W_scores_readout2tag)
341 | b_score_tag = dy.parameter(self.b_scores_readout2tag)
342 |
343 | tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding]))
344 | for src_encoding in src_encodings]
345 | tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs]
346 |
347 | transpose_transition_score = dy.parameter(self.transition_matrix) # (from, to)
348 |
349 | return transpose_transition_score.npvalue(), [ts.npvalue() for ts in tag_scores]
350 |
351 |     def decoding(self, src_encodings, OTag, addbias=False):
352 | ''' Viterbi decoding for a single sequence. '''
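    |         # Returns (best_path_score, best_path, tag_scores); OTag is only consulted
    |         # when addbias=True.  A hypothetical call, with `crf` and `tag_to_id` as
    |         # illustrative names:
    |         #
    |         #     score, path, tag_scores = crf.decoding(src_encodings, tag_to_id["O"])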
353 | W_src2tag_readout = dy.parameter(self.W_src2tag_readout)
354 | b_src2tag_readout = dy.parameter(self.b_src2tag_readout)
355 | W_score_tag = dy.parameter(self.W_scores_readout2tag)
356 | b_score_tag = dy.parameter(self.b_scores_readout2tag)
357 |
358 | tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding]))
359 | for src_encoding in src_encodings]
360 |         if addbias:  # replace the learned emission bias with a fixed vector that slightly favours OTag
361 | b_score_tag = np.zeros(self.tag_size)
362 | b_score_tag[OTag] = 0.5
363 | b_score_tag = dy.inputTensor(b_score_tag)
364 |
365 |
366 | tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs]
367 |
368 | back_trace_tags = []
369 | np_init_alpha = np.ones(self.tag_size) * -2000.0
370 | np_init_alpha[self.start_id] = 0.0
371 | max_tm1 = dy.inputTensor(np_init_alpha)
372 | transpose_transition_score = dy.parameter(self.transition_matrix) # (from, to)
373 |
374 | for i, tag_score in enumerate(tag_scores):
375 | max_tm1 = dy.concatenate_cols([max_tm1] * self.tag_size)
376 | max_t = max_tm1 + transpose_transition_score
377 | if i != 0:
378 | eval_score = max_t.npvalue()[:-2, :]
379 | else:
380 | eval_score = max_t.npvalue()
381 | best_tag = np.argmax(eval_score, axis=0)
382 | back_trace_tags.append(best_tag)
383 | max_tm1 = dy.inputTensor(eval_score[best_tag, range(self.tag_size)]) + tag_score
384 |
385 | terminal_max_T = max_tm1 + self.transition_matrix[self.end_id]
386 | eval_terminal = terminal_max_T.npvalue()[:-2]
387 | best_tag = np.argmax(eval_terminal, axis=0)
388 | best_path_score = eval_terminal[best_tag]
389 |
390 | best_path = [best_tag]
391 | for btpoint in reversed(back_trace_tags):
392 | best_tag = btpoint[best_tag]
393 | best_path.append(best_tag)
394 | start = best_path.pop()
395 | assert start == self.start_id
396 | best_path.reverse()
397 | return best_path_score, best_path, tag_scores
398 |
399 | def cal_accuracy(self, pred_path, true_path):
400 | return np.sum(np.equal(pred_path, true_path).astype(np.float32)) / len(pred_path)
401 |
402 |
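    | # Ensemble decoding: averages the per-position emission scores and the transition
    | # matrices produced by get_crf_scores() across models, then runs standard Viterbi
    | # over the averaged scores.  `tag_size` counts the real tags only; the two
    | # auxiliary start/end states are added internally.  A hypothetical call, assuming
    | # a list of decoders `crfs` and matching encodings `encs`:
    | #
    | #     pairs = [crf.get_crf_scores(enc) for crf, enc in zip(crfs, encs)]
    | #     trans, tags = zip(*pairs)
    | #     score, path = ensemble_viterbi_decoding(list(tags), list(trans), n_real_tags)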
403 | def ensemble_viterbi_decoding(l_tag_scores, l_transit_score, tag_size):
404 | back_trace_tags = []
405 | tag_size = tag_size + 2
406 | start_id = tag_size - 2
407 | end_id = tag_size - 1
408 | max_tm1 = np.ones(tag_size) * -2000.0
409 | max_tm1[start_id] = 0.0
410 |
411 | tag_scores = []
412 | for i in range(len(l_tag_scores[0])):
413 | tag_scores.append(sum([ts[i] for ts in l_tag_scores]) / len(l_tag_scores))
414 | transpose_transition_score = sum(l_transit_score) / len(l_transit_score) # (from, to)
415 |
416 | for i, tag_score in enumerate(tag_scores):
417 | max_tm1 = np.tile(np.expand_dims(max_tm1, axis=1), (1, tag_size))
418 | max_t = max_tm1 + transpose_transition_score
419 | if i != 0:
420 | eval_score = max_t[:-2, :]
421 | else:
422 | eval_score = max_t
423 | best_tag = np.argmax(eval_score, axis=0)
424 | back_trace_tags.append(best_tag)
425 | max_tm1 = eval_score[best_tag, range(tag_size)] + tag_score
426 |
427 | terminal_max_T = max_tm1 + transpose_transition_score[:, end_id]
428 | eval_terminal = terminal_max_T[:-2]
429 | best_tag = np.argmax(eval_terminal, axis=0)
430 | best_path_score = eval_terminal[best_tag]
431 |
432 | best_path = [best_tag]
433 | for btpoint in reversed(back_trace_tags):
434 | best_tag = btpoint[best_tag]
435 | best_path.append(best_tag)
436 | start = best_path.pop()
437 | assert start == start_id
438 | best_path.reverse()
439 | return best_path_score, best_path
440 |
441 |
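    | # A per-token softmax baseline: each position is scored independently by a single
    | # affine layer, with no transition matrix and no Viterbi search.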
442 | class classifier(Decoder):
443 | def __init__(self, model, input_dim, tag_size):
444 | self.W_softmax = model.add_parameters((tag_size, input_dim))
445 |         self.b_softmax = model.add_parameters((tag_size,))
446 |
447 | def decode_loss(self, src_encoding, tgt_tags):
448 | batch_size = len(tgt_tags)
449 | tgt_tags, tgt_mask = transpose_input(tgt_tags, 0)
450 |
451 | assert len(src_encoding) == len(tgt_tags)
452 |
453 | W_softmax = dy.parameter(self.W_softmax)
454 | b_softmax = dy.parameter(self.b_softmax)
455 |
456 | predictions = [dy.affine_transform([b_softmax, W_softmax, src_emb]) for src_emb in src_encoding]
457 |
458 | losses = [dy.pickneglogsoftmax_batch(pred, tgt) for pred, tgt in zip(predictions, tgt_tags)]
459 |
460 | loss = dy.sum_batches(dy.esum(losses)) / (batch_size * len(src_encoding))
461 |
462 | return loss
463 |
464 | def decoding(self, src_encoding):
465 | W_softmax = dy.parameter(self.W_softmax)
466 | b_softmax = dy.parameter(self.b_softmax)
467 | predictions = [dy.affine_transform([b_softmax, W_softmax, src_emb]) for src_emb in src_encoding]
468 |
469 | predictions = [np.argmax(pred.npvalue()) for pred in predictions]
470 |
471 | return None, predictions
472 |
--------------------------------------------------------------------------------