├── models ├── __init__.py ├── model_builder.py ├── encoders.py └── decoders.py ├── dataloaders ├── __init__.py └── data_loader.py ├── helper_scripts ├── countLinesCONLL.py ├── CombineAnnotatedFiles.py ├── removeAnnotatedSents.py ├── pickKTokens.py ├── pickKTokensRev.py └── SimulateAnnotations.py ├── utils ├── features.py └── util.py ├── commands ├── SAL_CT.sh ├── ETAL_FULL_CRF_CT.sh ├── ETAL_PARTIAL_CRF_CT.sh └── CFEAL_PARTIAL_CRF_CT.sh ├── README.md ├── args.py ├── main.py └── eval └── conlleval.v2 /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataloaders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /helper_scripts/countLinesCONLL.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import argparse 3 | 4 | arg_parser = argparse.ArgumentParser() 5 | 6 | arg_parser.add_argument("--input", help="Folder with the raw text data", 7 | default=None, 8 | type=str) 9 | 10 | args = arg_parser.parse_args() 11 | print("Args used for this run:") 12 | print(args) 13 | 14 | with codecs.open(args.input,"r",encoding='utf-8') as fin: 15 | index = 0 16 | one_line = [] 17 | for line in fin: 18 | if line == "" or line == "\n": 19 | if len(one_line) > 0: 20 | index +=1 21 | one_line = [] 22 | else: 23 | line = line.strip() 24 | one_line.append(line) 25 | 26 | if len(one_line)>0: 27 | index = index + 1 28 | print index 29 | 30 | -------------------------------------------------------------------------------- /helper_scripts/CombineAnnotatedFiles.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import argparse 3 | 4 | 5 | arg_parser = argparse.ArgumentParser() 6 | 7 | arg_parser.add_argument("--files", help="File 1", 8 | default=None,nargs='+') 9 | 10 | #arg_parser.add_argument("--file2", help="File 2", 11 | # default=None, 12 | # type=str) 13 | 14 | arg_parser.add_argument("--output", help="Output File", 15 | default=None, 16 | type=str) 17 | 18 | args = arg_parser.parse_args() 19 | print("Args used for this run:") 20 | print(args) 21 | 22 | 23 | files = args.files 24 | fout = codecs.open(args.output, "w", encoding='utf-8') 25 | 26 | for i in files: 27 | with codecs.open(i,"r", encoding='utf-8') as fin: 28 | for line in fin: 29 | fout.write(line) 30 | print "Done reading file: " + str(i) 31 | fout.write("\n") 32 | -------------------------------------------------------------------------------- /utils/features.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import numpy as np 3 | import pdb 4 | 5 | 6 | def get_feature_sent(lang, sent, args, cap_ratio_dict, type=None): 7 | dsf = [] 8 | individual_feats = [] 9 | 10 | if args.cap and not args.use_discrete_features: 11 | cap_feat = [w[0].isupper() for w in sent] 12 | individual_feats.append(cap_feat) 13 | 14 | if args.cap_ratio_path is not None: 15 | cap_feats = [] 16 | for w in sent: 17 | # feat = np.zeros(4,) 18 | feat = [0, 0, 0, 0] 19 | if w in cap_ratio_dict: 20 | feat[cap_ratio_dict[w]] = 1 21 | cap_feats.append(feat) 22 | individual_feats.append(cap_feats) 23 | 24 | # individual_feats = zip(*individual_feats) # [(), ()] 25 | if len(dsf) > 0 and len(individual_feats) > 0: 26 | # individual_feats = [list(i) for i in 
individual_feats] 27 | dsf = [list(i) for i in dsf] 28 | # for i, d in zip(individual_feats, dsf): 29 | # print i, d 30 | # print len(i), len(d) 31 | new_feat = [list(tuple(i + d)) for i, d in zip(individual_feats[0], dsf)] 32 | # pdb.set_trace() 33 | return new_feat 34 | elif len(individual_feats) > 0: 35 | return individual_feats 36 | elif len(dsf) > 0: 37 | return dsf 38 | else: 39 | return [] 40 | 41 | 42 | def get_brown_cluster(path): 43 | bc_dict = dict() 44 | linear_map = dict() 45 | with codecs.open(path, "r", "utf-8") as fin: 46 | for line in fin: 47 | fields = line.strip().split('\t') 48 | if len(fields) == 3: 49 | word = fields[1] 50 | binary_string = fields[0] 51 | bid = int(binary_string, 2) 52 | if bid not in linear_map: 53 | linear_map[bid] = len(linear_map) 54 | bc_dict[word] = linear_map[bid] 55 | return bc_dict 56 | -------------------------------------------------------------------------------- /helper_scripts/removeAnnotatedSents.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import codecs 3 | 4 | def selectUnAnnotated(args): 5 | annotated_sents = set() 6 | with codecs.open(args.annotated, "r",encoding='utf-8') as fin: 7 | sent = [] 8 | count = 0 9 | for line in fin: 10 | line = line.strip() 11 | if line == "" or line == "\n": 12 | annotated_sents.add(" ".join(sent)) 13 | count +=1 14 | sent =[] 15 | else: 16 | tokens = line.split("\t") 17 | sent.append(tokens[0]) 18 | 19 | print(count, len(annotated_sents)) 20 | fout = codecs.open("./annotated_sents.txt","w", encoding='utf-8') 21 | for sent in annotated_sents: 22 | fout.write(sent + "\n") 23 | 24 | ffull = codecs.open("./orig_sents.txt","w", encoding='utf-8') 25 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout: 26 | sent = [] 27 | tokens = [] 28 | for line in fin: 29 | line = line.strip() 30 | if line == "" or line == "\n": 31 | sentence = " ".join(tokens) 32 | ffull.write(sentence + "\n") 33 | tokens = [] 34 | if sentence not in annotated_sents: 35 | #q print(sentence) 36 | for l in sent: 37 | fout.write(l + "\n") 38 | fout.write("\n") 39 | sent = [] 40 | else: 41 | sent.append(line) 42 | tokens.append(line.split("\t")[0]) 43 | 44 | 45 | if __name__ == "__main__": 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument("--input",type=str) 48 | parser.add_argument("--annotated", type=str) 49 | parser.add_argument("--output", type=str) 50 | args = parser.parse_args() 51 | print(args) 52 | selectUnAnnotated(args) 53 | -------------------------------------------------------------------------------- /helper_scripts/pickKTokens.py: -------------------------------------------------------------------------------- 1 | import codecs, argparse 2 | 3 | 4 | def pickKTokens(args): 5 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout: 6 | count = args.k 7 | one_sent = [] 8 | for line in fin: 9 | if line == "" or line == "\n": 10 | for s in one_sent: 11 | fout.write(s + "\n") 12 | fout.write('\n') 13 | one_sent = [] 14 | if count <=0: 15 | break 16 | 17 | else: 18 | tokens = line.strip().split("\t") 19 | tag = tokens[1] 20 | token = tokens[0] 21 | if "UNK" in tag: 22 | count -= 1 23 | 24 | one_sent.append(line.strip()) 25 | 26 | 27 | if len(one_sent) > 0: 28 | for s in one_sent: 29 | fout.write(s + "\n") 30 | fout.write('\n') 31 | 32 | def pickKTokensRev(args): 33 | with codecs.open(args.input, "r", encoding='utf-8') as fin, 
codecs.open(args.output, "w", encoding='utf-8') as fout: 34 | count = args.k 35 | one_sent = [] 36 | for line in fin: 37 | if line == "" or line == "\n": 38 | for s in one_sent: 39 | fout.write(s + "\n") 40 | fout.write('\n') 41 | one_sent = [] 42 | if count <=0: 43 | break 44 | 45 | else: 46 | tokens = line.strip().split("\t") 47 | tag = tokens[1] 48 | token = tokens[0] 49 | count -= 1 50 | 51 | one_sent.append(line.strip()) 52 | 53 | 54 | if len(one_sent) > 0: 55 | for s in one_sent: 56 | fout.write(s + "\n") 57 | fout.write('\n') 58 | 59 | 60 | 61 | if __name__ == "__main__": 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument("--input", type=str) 64 | parser.add_argument("--k", type=int) 65 | parser.add_argument("--output",type=str) 66 | args = parser.parse_args() 67 | 68 | pickKTokens(args) 69 | #pickKTokensRev(args) 70 | -------------------------------------------------------------------------------- /helper_scripts/pickKTokensRev.py: -------------------------------------------------------------------------------- 1 | import codecs, argparse 2 | 3 | 4 | def pickKTokens(args): 5 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout: 6 | count = args.k 7 | one_sent = [] 8 | for line in fin: 9 | if line == "" or line == "\n": 10 | for s in one_sent: 11 | fout.write(s + "\n") 12 | fout.write('\n') 13 | one_sent = [] 14 | if count <=0: 15 | break 16 | 17 | else: 18 | tokens = line.strip().split("\t") 19 | tag = tokens[1] 20 | token = tokens[0] 21 | if "UNK" in tag: 22 | count -= 1 23 | 24 | one_sent.append(line.strip()) 25 | 26 | 27 | if len(one_sent) > 0: 28 | for s in one_sent: 29 | fout.write(s + "\n") 30 | fout.write('\n') 31 | 32 | def pickKTokensRev(args): 33 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout: 34 | count = args.k 35 | one_sent = [] 36 | for line in fin: 37 | if line == "" or line == "\n": 38 | for s in one_sent: 39 | fout.write(s + "\n") 40 | fout.write('\n') 41 | one_sent = [] 42 | if count <=0: 43 | break 44 | 45 | else: 46 | tokens = line.strip().split("\t") 47 | tag = tokens[1] 48 | token = tokens[0] 49 | count -= 1 50 | 51 | one_sent.append(line.strip()) 52 | 53 | 54 | if len(one_sent) > 0: 55 | for s in one_sent: 56 | fout.write(s + "\n") 57 | fout.write('\n') 58 | 59 | 60 | 61 | if __name__ == "__main__": 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument("--input", type=str) 64 | parser.add_argument("--k", type=int) 65 | parser.add_argument("--output",type=str) 66 | args = parser.parse_args() 67 | 68 | #pickKTokens(args) 69 | pickKTokensRev(args) 70 | -------------------------------------------------------------------------------- /commands/SAL_CT.sh: -------------------------------------------------------------------------------- 1 | DIR="../data/Spanish/SAL_CT" 2 | DATA="../data/Spanish" 3 | 4 | for i in {1..20} ; do 5 | python2 ../helper_scripts/pickKTokensRev.py --input $DIR/to_annotate_v${i}.1_LC.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll 6 | 7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll 8 | 9 | PREV=`expr $i - 1` 10 | 11 | python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR//unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll 12 | 13 | if [ "$i" -gt 1 ] 14 | then 15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll 
$DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll 16 | else 17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll 18 | fi 19 | 20 | #Train the NER Model Using FineTune 21 | MODEL_NAME="200_SAL_CT_spa_${i}.1_finetune" 22 | python -u ../main.py \ 23 | --dynet-seed 3278657 \ 24 | --word_emb_dim 100 \ 25 | --batch_size 10 \ 26 | --model_name ${MODEL_NAME} \ 27 | --lang es \ 28 | --fixedVocab \ 29 | --fineTune \ 30 | --test_conll \ 31 | --tot_epochs 1000 \ 32 | --aug_lang_train_path $DATA/vocab.conll \ 33 | --misc \ 34 | --init_lr 0.015 \ 35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \ 36 | --valid_freq 1300 \ 37 | --pretrain_emb_path $DATA/esp.vec \ 38 | --dev_path $DATA/esp.dev \ 39 | --test_path $DATA/esp.test \ 40 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 41 | 42 | 43 | #Run the Active Learning Session 44 | NEW=`expr $i + 1` 45 | #!/usr/bin/env bash 46 | MODEL_NAME="200_SAL_spa_${i}.1_finetune_activelearning" 47 | python -u ../main.py \ 48 | --dynet-seed 3278657 \ 49 | --mode test_1 \ 50 | --fixedVocab \ 51 | --aug_lang_train_path $DATA/vocab.conll \ 52 | --misc \ 53 | --word_emb_dim 100 \ 54 | --model_name ${MODEL_NAME} \ 55 | --lang es \ 56 | --load_from_path ../saved_models/200_SAL_CT_spa_${i}.1_finetune.model \ 57 | --pretrain_emb_path $DATA/esp.vec \ 58 | --dev_path $DATA/esp.dev \ 59 | --test_path $DIR/unlabel_v${i}.1.conll \ 60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \ 61 | --test_conll \ 62 | --k 200 \ 63 | --SPAN_wise \ 64 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 65 | 66 | done 67 | -------------------------------------------------------------------------------- /commands/ETAL_FULL_CRF_CT.sh: -------------------------------------------------------------------------------- 1 | DIR="../data/Spanish/ETAL_FULL_CRF_CT" 2 | DATA="../data/Spanish" 3 | 4 | for i in {1..20} ; do 5 | python2 ../helper_scripts/pickKTokens.py --input $DIR/to_annotate_v${i}.1.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll 6 | 7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll 8 | 9 | PREV=`expr $i - 1` 10 | 11 | python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR//unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll 12 | 13 | if [ "$i" -gt 1 ] 14 | then 15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll $DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll 16 | else 17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll 18 | fi 19 | 20 | #Train the NER Model Using FineTune 21 | MODEL_NAME="200_Entropy_Full_CT_spa_${i}.1_finetune" 22 | python -u ../main.py \ 23 | --dynet-seed 3278657 \ 24 | --word_emb_dim 100 \ 25 | --batch_size 10 \ 26 | --model_name ${MODEL_NAME} \ 27 | --lang es \ 28 | --fixedVocab \ 29 | --fineTune \ 30 | --test_conll \ 31 | --tot_epochs 1000 \ 32 | --misc \ 33 | --aug_lang_train_path $DATA/vocab.conll \ 34 | --init_lr 0.015 \ 35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \ 36 | --valid_freq 1300 \ 37 | --pretrain_emb_path $DATA/esp.vec \ 38 | --dev_path $DATA/esp.dev \ 39 | --test_path $DATA/esp.test \ 40 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 41 | 42 | 43 | #Run the Active Learning Session 44 | NEW=`expr $i + 1` 45 | #!/usr/bin/env bash 46 | MODEL_NAME="200_Entropy_Full_CT_spa_${i}.1_finetune_activelearning" 47 | python -u ../main.py \ 48 | --dynet-seed 3278657 \ 49 | 
--mode test_1 \ 50 | --fixedVocab \ 51 | --aug_lang_train_path $DATA/vocab.conll \ 52 | --word_emb_dim 100 \ 53 | --model_name ${MODEL_NAME} \ 54 | --lang es \ 55 | --misc \ 56 | --load_from_path ../saved_models/200_Entropy_Full_CT_spa_${i}.1_finetune.model \ 57 | --pretrain_emb_path $DATA/esp.vec \ 58 | --dev_path $DATA/esp.dev \ 59 | --test_path $DIR/unlabel_v${i}.1.conll \ 60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \ 61 | --ngram 5 \ 62 | --test_conll \ 63 | --entropy_threshold 1e-8 \ 64 | --k 200 \ 65 | --SPAN_wise \ 66 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 67 | 68 | done 69 | -------------------------------------------------------------------------------- /commands/ETAL_PARTIAL_CRF_CT.sh: -------------------------------------------------------------------------------- 1 | DIR="../data/Spanish/ETAL_PARTAL_CRF_CT" 2 | DATA="../data/Spanish" 3 | 4 | for i in {1..20} ; do 5 | python2 ../helper_scripts/pickKTokens.py --input $DIR/to_annotate_v${i}.1.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll 6 | 7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll --needUNK 8 | 9 | PREV=`expr $i - 1` 10 | 11 | python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR//unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll 12 | 13 | if [ "$i" -gt 1 ] 14 | then 15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll $DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll 16 | else 17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll 18 | fi 19 | 20 | #Train the NER Model Using FineTune 21 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune" 22 | python -u ../main.py \ 23 | --dynet-seed 3278657 \ 24 | --word_emb_dim 100 \ 25 | --batch_size 10 \ 26 | --model_name ${MODEL_NAME} \ 27 | --lang es \ 28 | --fixedVocab \ 29 | --fineTune \ 30 | --test_conll \ 31 | --misc \ 32 | --tot_epochs 1000 \ 33 | --aug_lang_train_path $DATA/vocab.conll \ 34 | --init_lr 0.015 \ 35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \ 36 | --valid_freq 1300 \ 37 | --pretrain_emb_path $DATA/esp.vec \ 38 | --use_partial \ 39 | --dev_path $DATA/esp.dev \ 40 | --test_path $DATA/esp.test \ 41 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 42 | 43 | 44 | #Run the Active Learning Session 45 | NEW=`expr $i + 1` 46 | #!/usr/bin/env bash 47 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune_activelearning" 48 | python -u ../main.py \ 49 | --dynet-seed 3278657 \ 50 | --mode test_1 \ 51 | --fixedVocab \ 52 | --aug_lang_train_path $DATA/vocab.conll \ 53 | --word_emb_dim 100 \ 54 | --model_name ${MODEL_NAME} \ 55 | --lang es \ 56 | --load_from_path ../saved_models/200_Entropy_Partial_CT_spa_${i}.1_finetune.model \ 57 | --pretrain_emb_path $DATA/esp.vec \ 58 | --dev_path $DATA/esp.dev \ 59 | --test_path $DIR/unlabel_v${i}.1.conll \ 60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \ 61 | --ngram 5 \ 62 | --misc \ 63 | --test_conll \ 64 | --entropy_threshold 1e-8 \ 65 | --use_partial \ 66 | --k 200 \ 67 | --SPAN_wise \ 68 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 69 | 70 | done 71 | -------------------------------------------------------------------------------- /commands/CFEAL_PARTIAL_CRF_CT.sh: -------------------------------------------------------------------------------- 1 | DIR="../data/Spanish/CFEAL_PARTAL_CRF_CT" 2 | DATA="../data/Spanish" 3 | 4 | for i 
in {1..20} ; do 5 | python2 ../helper_scripts/pickKTokens.py --input $DIR/to_annotate_v${i}.1.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll 6 | 7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll --needUNK 8 | 9 | PREV=`expr $i - 1` 10 | 11 | python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR//unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll 12 | 13 | if [ "$i" -gt 1 ] 14 | then 15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll $DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll 16 | else 17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll 18 | fi 19 | 20 | #Train the NER Model Using FineTune 21 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune" 22 | python -u ../main.py \ 23 | --dynet-seed 3278657 \ 24 | --word_emb_dim 100 \ 25 | --batch_size 10 \ 26 | --model_name ${MODEL_NAME} \ 27 | --lang es \ 28 | --fixedVocab \ 29 | --fineTune \ 30 | --test_conll \ 31 | --tot_epochs 1000 \ 32 | --aug_lang_train_path $DATA/vocab.conll \ 33 | --misc \ 34 | --init_lr 0.015 \ 35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \ 36 | --valid_freq 1300 \ 37 | --pretrain_emb_path $DATA/esp.vec \ 38 | --use_partial \ 39 | --dev_path $DATA/esp.dev \ 40 | --test_path $DATA/esp.test \ 41 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 42 | 43 | 44 | #Run the Active Learning Session 45 | NEW=`expr $i + 1` 46 | #!/usr/bin/env bash 47 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune_activelearning" 48 | python -u ../main.py \ 49 | --dynet-seed 3278657 \ 50 | --mode test_1 \ 51 | --fixedVocab \ 52 | --aug_lang_train_path $DATA/vocab.conll \ 53 | --word_emb_dim 100 \ 54 | --model_name ${MODEL_NAME} \ 55 | --lang es \ 56 | --load_from_path ../saved_models/200_Entropy_Partial_CT_spa_${i}.1_finetune.model \ 57 | --pretrain_emb_path $DATA/esp.vec \ 58 | --dev_path $DATA/esp.dev \ 59 | --test_path $DIR/unlabel_v${i}.1.conll \ 60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \ 61 | --misc \ 62 | --ngram 5 \ 63 | --test_conll \ 64 | --entropy_threshold 0 \ 65 | --use_partial \ 66 | --k 200 \ 67 | --use_CFB \ 68 | --SPAN_wise \ 69 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 70 | 71 | done 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Active Learning for Entity Recognition 2 | 3 | ### Requirements 4 | python 2.7
5 | DyNet version: commit 284838815ece9297a7100cc43035e1ea1b133a5 6 | 7 | 8 | ### Data 9 | In ```data/```, create one directory per language, as shown for ```data/Spanish```. Download the CoNLL train/dev/test NER datasets for that language here. To acquire the LDC datasets, please obtain the required access. 10 | 11 | 12 | For storing the trained models, create a ```saved_models``` directory in the parent folder. 13 | ### Embeddings 14 | Combine monolingual data acquired from Wikipedia with the plain text extracted from the labeled data, and train 100-dimensional [GloVe](https://nlp.stanford.edu/projects/glove/) embeddings on the combined text. 15 | 16 | ### Active Learning Simulation 17 | The best NER performance was obtained with the fine-tuning training scheme. The scripts below run simulated active learning with the different selection strategies (a sketch of the entropy-based span selection is given after the list): ``` cd commands```
19 | * ETAL + Partial-CRF + CT (Proposed recipe)
``` ./ETAL_PARTIAL_CRF_CT.sh ```
20 | * ETAL + Full-CRF + CT
``` ./ETAL_FULL_CRF_CT.sh ```
21 | * CFEAL + Partial-CRF + CT
``` ./CFEAL_PARTIAL_CRF_CT.sh ```
22 | * SAL + CT
23 | ``` ./SAL_CT.sh ```
29 | 30 | ```El O```
31 | ```grupo O```
32 | 33 | The LDC NER label set differ from the CoNLL label set by one tag. Therefore, add ``` --misc ``` to the argument set when running any experiments on CoNLL data. The label set has been hard-coded in the ```data_loaders/data_loader.py``` file. 34 | 35 | ### Cross-Lingual Transferred Data 36 | We used the model proposed by (Xie et al. 2018) to get the cross-lingually transferred data from English. 37 | Please refer to their code [here](https://github.com/thespectrewithin/cross-lingual_NER). 38 | 39 | For the Fine-Tune training scheme, train a base NER model on the transferred model as follows: 40 | 41 | MODEL_NAME="spanish_full_transfer_baseline" 42 | python -u ../main.py \ 43 | --dynet-seed 3278657 \ 44 | --word_emb_dim 100 \ 45 | --batch_size 10 \ 46 | --model_name ${MODEL_NAME} \ 47 | --lang es \ 48 | --fixedVocab \ 49 | --test_conll \ 50 | --tot_epochs 1000 \ 51 | --aug_lang_train_path $DATA/vocab.conll \ 52 | --init_lr 0.015 \ 53 | --valid_freq 1300 \ 54 | --misc \ 55 | --pretrain_emb_path $DATA/esp.vec \ 56 | --dev_path $DATA/esp.dev \ 57 | --test_path $DATA/esp.test \ 58 | --train_path $DIR/transferred_data.conll 2>&1 | tee ${MODEL_NAME}.log 59 | 60 | ### References 61 | If you make use of this software for research purposes, we will appreciate citing the following: 62 | ``` 63 | @inproceedings{chaudhary19emnlp, 64 | title = {A Little Annotation does a Lot of Good: A Study in Bootstrapping Low-resource Named Entity Recognizers}, 65 | author = {Aditi Chaudhary and Jiateng Xie and Zaid Sheikh and Graham Neubig and Jaime Carbonell}, 66 | booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 67 | address = {Hong Kong}, 68 | month = {November}, 69 | url = {http://arxiv.org/abs/1908.08983}, 70 | year = {2019} 71 | } 72 | ``` 73 | 74 | ### Contact 75 | For any issues, please feel free to reach out to `aschaudh@andrew.cmu.edu`. 
76 | -------------------------------------------------------------------------------- /helper_scripts/SimulateAnnotations.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import argparse 3 | from copy import deepcopy 4 | 5 | 6 | 7 | 8 | 9 | 10 | def annotate(input, output): 11 | gold_lines = [] 12 | 13 | with codecs.open(input, "r", encoding='utf-8') as fin,codecs.open(output, "w", encoding='utf-8') as fout: 14 | actual_line = [] 15 | actual_one_line = [] 16 | 17 | crf_line= [] 18 | crf_one_line = [] 19 | 20 | gold_one_line = [] 21 | prev = "" 22 | for line in fin: 23 | if line == "" or line == "\n": 24 | #fout.write("\n") 25 | actual_line.append(actual_one_line) 26 | actual_one_line = [] 27 | 28 | crf_line.append(crf_one_line) 29 | crf_one_line = [] 30 | 31 | gold_lines.append(gold_one_line) 32 | gold_one_line = [] 33 | 34 | prev = "" 35 | else: 36 | tokens = line.strip().split() 37 | gold_one_line.append(tokens[-1]) 38 | 39 | if "UNK" in tokens[1]: #Find the true start of the entity 40 | #fout.write(tokens[0] + "\t" + tokens[-1] + '\n') 41 | actual_one_line.append(tokens[0] + "\t" + tokens[-1]) 42 | prev = tokens[-1] 43 | 44 | 45 | else: 46 | #fout.write(tokens[0] + "\t" + tokens[1] + '\n') 47 | # actual_one_line.append(tokens[0] + "\t" + tokens[1]) 48 | if prev != "" and tokens[-1].startswith("I-"): 49 | BIO_tag = tokens[-1] 50 | prev =tokens[-1] 51 | else: 52 | if args.needUNK: 53 | BIO_tag = "B-UNK" 54 | else: 55 | #BIO_tag = "O" 56 | BIO_tag = tokens[1] 57 | prev = "" 58 | actual_one_line.append(tokens[0] + "\t" + BIO_tag) 59 | 60 | 61 | index = 0 62 | lines = [] 63 | one_line = [] 64 | for line in actual_line: 65 | prev = "" 66 | for token_tag in line: 67 | current_tag = token_tag.split("\t")[-1] 68 | token = token_tag.split("\t")[0] 69 | 70 | 71 | if prev != "": 72 | if prev == "O" and "I-" in current_tag: 73 | #print("Check index :{0} for inconsistency {1}".format(index, token)) 74 | token_tag = token + "\t" + "B-" + current_tag.split("-")[-1] 75 | 76 | if (prev == "B-PER" or prev == "I-PER") and current_tag in ['I-LOC, I-ORG, I-GPE']: 77 | #print("Check index :{0} for inconsistency {1}".format(index, token)) 78 | token_tag = token + "\t" + "I-PER" 79 | 80 | if (prev == "B-GPE" or prev == "I-GPE") and current_tag in ['I-LOC, I-ORG, I-PER']: 81 | #print("Check index :{0} for inconsistency".format(index,token)) 82 | token_tag = token + "\t" + "I-GPE" 83 | 84 | if (prev == "B-LOC" or prev == "I-LOC") and current_tag in ['I-PER, I-ORG, I-GPE']: 85 | #print("Check index :{0} for inconsistency {1}".format(index,token)) 86 | token_tag = token + "\t" + "I-LOC" 87 | 88 | if (prev == "B-ORG" or prev == "I-ORG") and current_tag in ['I-LOC, I-PER, I-GPE']: 89 | #print("Check index :{0} for inconsistency {1}".format(index,token)) 90 | token_tag = token + "\t" + "I-ORG" 91 | 92 | 93 | 94 | prev = current_tag 95 | 96 | index +=1 97 | one_line.append(token_tag) 98 | #fout.write(token_tag + "\n") 99 | index += 1 100 | lines.append(one_line) 101 | one_line =[] 102 | #fout.write("\n") 103 | print(len(lines)) 104 | for line_num, line in enumerate(lines): 105 | prev = "" 106 | for token_num, token_tag in enumerate(line): 107 | token = token_tag.split("\t")[0] 108 | tag = token_tag.split("\t")[-1] 109 | if prev != "": 110 | if prev in ["B-UNK","O"] and tag in ["I-LOC", "I-GPE", "I-LOC", "I-MISC","I-PER","I-ORG"]: 111 | gold_one_line = gold_lines[line_num] 112 | gold_cur_tag = gold_one_line[token_num] 113 | temp_num = deepcopy(token_num) 114 | 
while not gold_cur_tag.startswith("B-"): 115 | temp_num -=1 116 | gold_cur_tag = gold_one_line[temp_num] 117 | line[temp_num] = line[temp_num].split("\t")[0] + "\t" + gold_cur_tag 118 | prev = tag 119 | 120 | for token_tag in line: 121 | fout.write(token_tag + '\n') 122 | fout.write("\n") 123 | 124 | 125 | if __name__ == "__main__": 126 | parser = argparse.ArgumentParser() 127 | parser.add_argument("--input", type=str, default=None, help="Active learning output") 128 | parser.add_argument("--output", type=str, default=None, help ="Simulated NI with gold annotations in place of UNK") 129 | parser.add_argument("--needUNK", default=False, action="store_true", help="Simulated NI with gold annotations in place of UNK") 130 | args = parser.parse_args() 131 | 132 | annotate(args.input, args.output) 133 | -------------------------------------------------------------------------------- /args.py: -------------------------------------------------------------------------------- 1 | def init_config(): 2 | import argparse 3 | parser = argparse.ArgumentParser() 4 | parser.add_argument("--dynet-mem", default=1000, type=int) 5 | parser.add_argument("--dynet-seed", default=5783287, type=int) 6 | parser.add_argument("--dynet-gpu") 7 | 8 | parser.add_argument("--model_name", type=str, default=None) 9 | parser.add_argument("--eval_folder", type=str, default="../eval") 10 | parser.add_argument("--lang", default="english", help="the target language") 11 | parser.add_argument("--train_ensemble", default=False, action="store_true") 12 | parser.add_argument("--full_data_path", type=str, default=None, help="when train_ensemble is true, this one is the full data path from which to load vocabulary.") 13 | parser.add_argument("--train_path", default="../datasets/english/eng.train.bio.conll", type=str) 14 | # parser.add_argument("--train_path", default="../datasets/english/debug_train.bio", type=str) 15 | parser.add_argument("--monolingual_data_path", default=None, type=str) 16 | parser.add_argument("--dev_path", default="../datasets/english/eng.dev.bio.conll", type=str) 17 | parser.add_argument("--test_path", default="../datasets/english/eng.test.bio.conll", type=str) 18 | parser.add_argument("--new_test_path", default="../datasets/english/eng.test.bio.conll", type=str) 19 | parser.add_argument("--new_test_conll", default="../datasets/english/eng.test.bio.conll", type=str) 20 | parser.add_argument("--save_to_path", default="../saved_models/") 21 | parser.add_argument("--load_from_path", default=None) 22 | parser.add_argument("--train_filename_path", default=None, type=str) 23 | parser.add_argument("--dev_filename_path", default=None, type=str) 24 | parser.add_argument("--test_filename_path", default=None, type=str) 25 | 26 | 27 | parser.add_argument("--model_arc", default="char_cnn", choices=["char_cnn", "char_birnn", "char_birnn_cnn", "sep", "sep_cnn_only"], type=str) 28 | parser.add_argument("--tag_emb_dim", default=50, type=int) 29 | parser.add_argument("--pos_emb_dim", default=50, type=int) 30 | parser.add_argument("--char_emb_dim", default=30, type=int) 31 | parser.add_argument("--word_emb_dim", default=100, type=int) 32 | parser.add_argument("--cnn_filter_size", default=30, type=int) 33 | parser.add_argument("--cnn_win_size", default=3, type=int) 34 | parser.add_argument("--rnn_type", default="lstm", choices=['lstm', 'gru'], type=str) 35 | parser.add_argument("--hidden_dim", default=200, type=int, help="token level rnn hidden dim") 36 | parser.add_argument("--char_hidden_dim", default=25, type=int, help="char 
level rnn hidden dim") 37 | parser.add_argument("--layer", default=1, type=int) 38 | 39 | parser.add_argument("--replace_unk_rate", default=0.0, type=float, help="uses when not all words in the test data is covered by the pretrained embedding") 40 | parser.add_argument("--remove_singleton", default=False, action="store_true") 41 | parser.add_argument("--map_pretrain", default=False, action="store_true") 42 | parser.add_argument("--map_dim", default=100, type=int) 43 | parser.add_argument("--pretrain_fix", default=False, action="store_true") 44 | 45 | parser.add_argument("--output_dropout_rate", default=0.5, type=float, help="dropout applied to the output of birnn before crf") 46 | parser.add_argument("--emb_dropout_rate", default=0.3, type=float, help="dropout applied to the input of token-level birnn") 47 | parser.add_argument("--valid_freq", default=500, type=int) 48 | parser.add_argument("--tot_epochs", default=100, type=int) 49 | parser.add_argument("--batch_size", default=10, type=int) 50 | parser.add_argument("--init_lr", default=0.015, type=float) 51 | parser.add_argument("--lr_decay", default=False, action="store_true") 52 | parser.add_argument("--decay_rate", default=0.05, action="store", type=float) 53 | parser.add_argument("--patience", default=3, type=int) 54 | 55 | parser.add_argument("--tagging_scheme", default="bio", choices=["bio", "bioes"], type=str) 56 | 57 | parser.add_argument("--data_aug", default=False, action="store_true", help="If use data_aug, the train_path should be the combined training file") 58 | parser.add_argument("--aug_lang", default="english", help="the language to augment the dataset") 59 | parser.add_argument("--aug_lang_train_path", default=None, type=str) 60 | parser.add_argument("--tgt_lang_train_path", default="../datasets/english/eng.train.bio.conll", type=str) 61 | 62 | parser.add_argument("--pretrain_emb_path", type=str, default=None) 63 | parser.add_argument("--res_discrete_feature", default=False, action="store_true", help="residual use of discrete features") 64 | 65 | parser.add_argument("--feature_birnn_hidden_dim", default=50, type=int, action="store") 66 | 67 | parser.add_argument("--use_discrete_features", default=False, action="store_true", help="David's indicator features") 68 | parser.add_argument("--split_hashtag", default=False, action="store_true", help="indicator of preceding hashtags") 69 | parser.add_argument("--cap", default=False, action="store_true", help="capitalization feature") 70 | parser.add_argument("--feature_dim", type=int, default=10, help="dimension of discrete features") 71 | 72 | parser.add_argument("--use_brown_cluster", default=False, action="store_true") 73 | parser.add_argument("--brown_cluster_path", action="store", type=str, help="path to the brown cluster features") 74 | parser.add_argument("--brown_cluster_num", default=0, type=int, action="store") 75 | parser.add_argument("--brown_cluster_dim", default=30, type=int, action="store") 76 | 77 | # Use trained model to test 78 | parser.add_argument("--mode", default="train", type=str, choices=["train", "test_1"], 79 | help="test_1: use one model") 80 | 81 | # Partial CRF 82 | parser.add_argument("--use_partial", default=False, action="store_true") 83 | 84 | # Active Learning 85 | parser.add_argument("--ngram", default=2, type=int) 86 | parser.add_argument("--to_annotate", type=str,default="./annotate.txt") 87 | parser.add_argument("--entropy_threshold", type=float, default=None) 88 | parser.add_argument("--use_CFB", default=False, action="store_true") 89 | 
parser.add_argument("--SPAN_wise", default=False, action="store_true", help="get span wise scores, even if there are duplicates.") 90 | parser.add_argument("--k", default=200, type=int, help="fixed number of spans to annotate") 91 | parser.add_argument("--debug", type=str) 92 | # Format of test output 93 | parser.add_argument("--test_conll", default=False, action="store_true") 94 | parser.add_argument("--fixedVocab", default=False, action="store_true", help="for loading pre-trained model") 95 | parser.add_argument("--fineTune", default=False, action="store_true", help="for loading pre-trained model") 96 | parser.add_argument("--run",default=0, type=int) 97 | parser.add_argument("--misc",default=False, action="store_true") 98 | parser.add_argument("--addbias", default=False, action="store_true") 99 | args = parser.parse_args() 100 | 101 | 102 | if args.train_ensemble: 103 | # model_name = ens_1_ + original 104 | # set dynet seed manually 105 | ens_no = int(args.model_name.split("_")[1]) 106 | # dyparams = dy.DynetParams() 107 | # dyparams.set_random_seed(ens_no + 5783287) 108 | # dyparams.init() 109 | 110 | import dynet_config 111 | dynet_config.set(random_seed=ens_no + 5783290) 112 | # if args.cuda: 113 | # dynet_config.set_gpu() 114 | 115 | # args.train_path = args.train_path.split(".")[0] + "_" + str(ens_no) + ".conll" 116 | 117 | if args.full_data_path is None: 118 | args.full_data_path = args.train_path 119 | args.save_to_path = args.save_to_path + args.model_name + ".model" 120 | print(args) 121 | return args 122 | -------------------------------------------------------------------------------- /models/model_builder.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou and aditichaudhary' 2 | from encoders import * 3 | from decoders import * 4 | from collections import defaultdict 5 | from copy import deepcopy 6 | 7 | #np.set_printoptions(threshold='nan') 8 | 9 | 10 | class CRF_Model(object): 11 | def __init__(self, args, data_loader, lm_data_loader=None): 12 | self.save_to = args.save_to_path 13 | self.load_from = args.load_from_path 14 | tag_to_id = data_loader.tag_to_id 15 | self.constraints = None 16 | # print self.constraints 17 | 18 | #partial CRF 19 | self.use_partial = args.use_partial 20 | self.tag_to_id = tag_to_id 21 | self.B_UNK = data_loader.B_UNK 22 | self.I_UNK = data_loader.I_UNK 23 | 24 | #active learning for partial annotations 25 | self.entropy_spans = defaultdict(lambda: 0) 26 | self.full_sentences = {} 27 | self.use_CFB = args.use_CFB 28 | self.addbias = args.addbias 29 | self.B_tags = [] 30 | self.I_tags = [] 31 | self.O_tags = [] 32 | B_tags = [] 33 | I_tags = [] 34 | for tag in tag_to_id: 35 | if "B-" in tag: 36 | B_tags.append(tag) 37 | elif "I-" in tag: 38 | I_tags.append(tag) 39 | elif tag == "O": 40 | self.O_tags.append(tag_to_id[tag]) 41 | B_tags = sorted(B_tags) 42 | I_tags = sorted(I_tags) 43 | self.B_tags = [tag_to_id[tag] for tag in B_tags] 44 | self.I_tags = [tag_to_id[tag] for tag in I_tags] 45 | 46 | def forward(self, sents, char_sents, feats, bc_feats, training=True): 47 | raise NotImplementedError 48 | 49 | def save(self): 50 | if self.save_to is not None: 51 | self.model.save(self.save_to) 52 | else: 53 | print('Save to path not provided!') 54 | 55 | def load(self, path=None): 56 | if path is None: 57 | path = self.load_from 58 | if self.load_from is not None or path is not None: 59 | print('Load model parameters from %s!' 
% path) 60 | self.model.populate(path) 61 | else: 62 | print('Load from path not provided!') 63 | 64 | def cal_loss(self, sents, char_sents, ner_tags, feats, bc_feats, known_tags, lm_batch=None, training=True): 65 | birnn_outputs = self.forward(sents, char_sents, feats, bc_feats, training=training) 66 | crf_loss = self.crf_decoder.decode_loss(birnn_outputs, ner_tags,self.use_partial, known_tags, self.tag_to_id, self.B_UNK, self.I_UNK) 67 | return crf_loss#, sum_s, sent_s 68 | 69 | def eval(self, sents, char_sents, feats, bc_feats, training=False,type="eval"): 70 | birnn_outputs = self.forward(sents, char_sents, feats, bc_feats, training=training) 71 | best_score, best_path, tag_scores = self.crf_decoder.decoding(birnn_outputs, self.O_tags, addbias=self.addbias) 72 | best_path_copy = deepcopy(best_path) 73 | if type == "test": 74 | alpha_value, alphas = self.crf_decoder.forward_alg(tag_scores) 75 | beta_value, betas = self.crf_decoder.backward_one_sequence(tag_scores) 76 | # print("Alpha:{0} Beta:{1}".format(alpha_value.value(), beta_value.value())) 77 | sent = sents[0] 78 | gammas = [] 79 | sum = [] 80 | for i in range(len(sent)): 81 | gammas.append(alphas[i] + betas[i] - alpha_value) 82 | 83 | if self.use_CFB: 84 | self.crf_decoder.get_uncertain_subsequences_CFB(sent, tag_scores, alphas, betas, alpha_value, gammas, 85 | best_path_copy, self.tag_to_id 86 | , self.B_UNK, self.I_UNK) 87 | 88 | else: 89 | self.crf_decoder.get_uncertain_subsequences(sent, tag_scores, alphas, betas, alpha_value, gammas, 90 | best_path_copy 91 | , self.B_tags, self.I_tags, self.O_tags) 92 | 93 | 94 | return best_score - alpha_value, best_path 95 | else: 96 | return best_score, best_path 97 | 98 | def eval_scores(self, sents, char_sents, feats, bc_feats, training=False): 99 | birnn_outputs = self.forward(sents, char_sents, feats, bc_feats, training=training) 100 | tag_scores, transit_score = self.crf_decoder.get_crf_scores(birnn_outputs) 101 | return tag_scores, transit_score 102 | 103 | 104 | class vanilla_NER_CRF_model(CRF_Model): 105 | ''' Implement End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF. 
''' 106 | def __init__(self, args, data_loader, lm_data_loader=None): 107 | # super(vanilla_NER_CRF_model, self).__init__(args, data_loader) 108 | self.model = dy.Model() 109 | self.args = args 110 | super(vanilla_NER_CRF_model, self).__init__(args, data_loader) 111 | 112 | self.res_discrete = args.res_discrete_feature 113 | 114 | ner_tag_size = data_loader.ner_vocab_size 115 | char_vocab_size = data_loader.char_vocab_size 116 | word_vocab_size = data_loader.word_vocab_size 117 | word_padding_token = data_loader.word_padding_token 118 | 119 | char_emb_dim = args.char_emb_dim 120 | word_emb_dim = args.word_emb_dim 121 | tag_emb_dim = args.tag_emb_dim 122 | birnn_input_dim = args.cnn_filter_size + args.word_emb_dim 123 | hidden_dim = args.hidden_dim 124 | src_ctx_dim = args.hidden_dim * 2 125 | 126 | cnn_filter_size = args.cnn_filter_size 127 | cnn_win_size = args.cnn_win_size 128 | output_dropout_rate = args.output_dropout_rate 129 | emb_dropout_rate = args.emb_dropout_rate 130 | 131 | if args.use_discrete_features: 132 | self.num_feats = data_loader.num_feats 133 | self.feature_encoder = Discrete_Feature_Encoder(self.model, self.num_feats, args.feature_dim) 134 | if self.res_discrete: 135 | src_ctx_dim += args.feature_dim * self.num_feats 136 | else: 137 | birnn_input_dim += args.feature_dim * self.num_feats 138 | 139 | if args.use_brown_cluster: 140 | bc_num = args.brown_cluster_num 141 | bc_dim = args.brown_cluster_dim 142 | # for each batch, the length of input seqs are the same, so we don't have bother with padding 143 | self.bc_encoder = Lookup_Encoder(self.model, args, bc_num, bc_dim, word_padding_token, isFeatureEmb=True) 144 | 145 | if self.res_discrete: 146 | src_ctx_dim += bc_dim 147 | else: 148 | birnn_input_dim += bc_dim 149 | 150 | self.char_cnn_encoder = CNN_Encoder(self.model, char_emb_dim, cnn_win_size, cnn_filter_size, 151 | 0.0, char_vocab_size, data_loader.char_padding_token) 152 | if args.pretrain_emb_path is None: 153 | self.word_lookup = Lookup_Encoder(self.model, args, word_vocab_size, word_emb_dim, word_padding_token) 154 | else: 155 | print("In NER CRF: Using pretrained word embedding!") 156 | self.word_lookup = Lookup_Encoder(self.model, args, word_vocab_size, word_emb_dim, word_padding_token, data_loader.pretrain_word_emb) 157 | self.birnn_encoder = BiRNN_Encoder(self.model, birnn_input_dim, hidden_dim, emb_dropout_rate=emb_dropout_rate, 158 | output_dropout_rate=output_dropout_rate) 159 | 160 | self.crf_decoder = chain_CRF_decoder(args, self.model, src_ctx_dim, tag_emb_dim, ner_tag_size, constraints=self.constraints) 161 | 162 | def forward(self, sents, char_sents, feats, bc_feats, training=True): 163 | char_embs = self.char_cnn_encoder.encode(char_sents, training=training) 164 | word_embs = self.word_lookup.encode(sents) 165 | 166 | if self.args.use_discrete_features: 167 | feat_embs = self.feature_encoder.encode(feats) 168 | 169 | if self.args.use_brown_cluster: 170 | bc_feat_embs = self.bc_encoder.encode(bc_feats) 171 | 172 | if self.args.use_discrete_features and self.args.use_brown_cluster: 173 | concat_inputs = [dy.concatenate([c, w, f, b]) for c, w, f, b in 174 | zip(char_embs, word_embs, feat_embs, bc_feat_embs)] 175 | elif self.args.use_brown_cluster and not self.args.use_discrete_features: 176 | concat_inputs = [dy.concatenate([c, w, f]) for c, w, f in 177 | zip(char_embs, word_embs, bc_feat_embs)] 178 | elif self.args.use_discrete_features and not self.args.use_brown_cluster: 179 | concat_inputs = [dy.concatenate([c, w, f]) for c, w, f in 180 | 
zip(char_embs, word_embs, feat_embs)] 181 | else: 182 | concat_inputs = [dy.concatenate([c, w]) for c, w in zip(char_embs, word_embs)] 183 | 184 | birnn_outputs = self.birnn_encoder.encode(concat_inputs, training=training) 185 | return birnn_outputs 186 | -------------------------------------------------------------------------------- /dataloaders/data_loader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou and aditichaudhary' 2 | import os 3 | from utils.util import * 4 | from utils.features import * 5 | #from utils.segnerfts import orm_morph as ormnorm 6 | 7 | #tagset = ['B-LOC','B-PER','B-MISC', 'B-ORG','I-LOC','I-PER','I-MISC', 'I-ORG','O'] 8 | tagset = ['B-LOC','B-PER','B-GPE', 'B-ORG','I-LOC','I-PER','I-GPE', 'I-ORG','O'] 9 | 10 | class NER_DataLoader(): 11 | def __init__(self, args, special_normal=False): 12 | # This is data loader as well as feature extractor!! 13 | 14 | self.args = args 15 | if args.train_ensemble: 16 | self.train_path = args.full_data_path 17 | else: 18 | self.train_path = args.train_path 19 | self.test_path = args.test_path 20 | self.dev_path = args.dev_path 21 | self.args = args 22 | 23 | self.tag_vocab_path = self.train_path + ".tag_vocab" 24 | self.word_vocab_path = self.train_path + ".word_vocab" 25 | self.char_vocab_path = self.train_path + ".char_vocab" 26 | 27 | self.pretrained_embedding_path = args.pretrain_emb_path 28 | self.use_discrete_feature = args.use_discrete_features 29 | self.use_brown_cluster = args.use_brown_cluster 30 | 31 | self.train_senttypes = self.dev_senttypes = self.test_senttypes = None 32 | 33 | if self.use_brown_cluster: 34 | self.brown_cluster_dicts = get_brown_cluster(args.brown_cluster_path) 35 | self.brown_cluster_dicts[''] = len(self.brown_cluster_dicts) 36 | args.brown_cluster_num = len(self.brown_cluster_dicts) 37 | else: 38 | self.brown_cluster_dicts = None 39 | 40 | print("Generating vocabs from training file ....") 41 | paths_to_read = [self.train_path, self.dev_path, self.test_path] 42 | 43 | if args.fixedVocab: #Make vaocabulary from the args.aug_lang_train_path 44 | _, self.word_to_id, self.char_to_id = self.read_files([self.args.aug_lang_train_path]) 45 | self.tag_to_id = {} 46 | # self.word_to_id = {} 47 | # self.char_to_id = {} 48 | for tag in tagset: 49 | if args.misc: 50 | tag = tag.replace("GPE", "MISC") 51 | self.tag_to_id[tag] = len(self.tag_to_id) 52 | else: 53 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files(paths_to_read) 54 | print("Size of vocab before: %d" % len(self.word_to_id)) 55 | self.word_to_id[''] = len(self.word_to_id) + 1 56 | self.char_to_id[''] = len(self.char_to_id) + 1 57 | self.word_to_id['<\s>'] = 0 58 | self.char_to_id[''] = 0 59 | print("Size of vocab after: %d" % len(self.word_to_id)) 60 | self.word_padding_token = 0 61 | self.char_padding_token = 0 62 | 63 | if self.pretrained_embedding_path is not None: 64 | self.pretrain_word_emb, self.word_to_id, self.char_to_id = get_pretrained_emb(self.args.fixedVocab, self.pretrained_embedding_path, 65 | self.word_to_id, self.char_to_id, args.word_emb_dim) 66 | 67 | # for char vocab and word vocab, we reserve id 0 for the eos padding, and len(vocab)-1 for the 68 | self.id_to_tag = {v: k for k, v in self.tag_to_id.iteritems()} 69 | self.id_to_word = {v: k for k, v in self.word_to_id.iteritems()} 70 | self.id_to_char = {v: k for k, v in self.char_to_id.iteritems()} 71 | 72 | self.ner_vocab_size = len(self.id_to_tag) 73 | self.word_vocab_size = len(self.id_to_word) 
74 | self.char_vocab_size = len(self.id_to_char) 75 | 76 | self.cap_ratio_dict = None 77 | 78 | #Partial CRF 79 | self.B_UNK = self.ner_vocab_size + 1 80 | self.I_UNK = self.ner_vocab_size + 2 81 | 82 | print("Size of vocab after: %d" % len(self.word_to_id)) 83 | print("NER tag num=%d, Word vocab size=%d, Char Vocab size=%d" % (self.ner_vocab_size, self.word_vocab_size, self.char_vocab_size)) 84 | 85 | 86 | @staticmethod 87 | def exists(path): 88 | return os.path.exists(path) 89 | 90 | def read_one_line(self, line, tag_set, word_dict, char_set): 91 | for w in line: 92 | fields = w.split() 93 | if len(fields) !=2: 94 | print("ERROR! Incorrect number of fields in the file, required two.") 95 | print(fields) 96 | exit(0) 97 | word = fields[0] 98 | ner_tag = fields[-1] 99 | 100 | for c in word: 101 | char_set.add(c) 102 | if "UNK" not in ner_tag: 103 | if self.args.misc: 104 | ner_tag = ner_tag.replace("GPE","MISC") 105 | tag_set.add(ner_tag) 106 | word_dict[word] += 1 107 | 108 | def get_vocab_from_set(self, a_set, shift=0): 109 | vocab = {} 110 | for i, elem in enumerate(a_set): 111 | vocab[elem] = i + shift 112 | 113 | return vocab 114 | 115 | def get_vocab_from_dict(self, a_dict, shift=0, remove_singleton=False): 116 | vocab = {} 117 | i = 0 118 | self.singleton_words = set() 119 | 120 | #Sort the defaultdict 121 | sortedDict = sorted(a_dict.iteritems(), key=lambda (k, v): v, reverse=True) 122 | for (k,v) in sortedDict: 123 | 124 | #for k, v in a_dict.iteritems(): 125 | if v == 1: 126 | self.singleton_words.add(i + shift) 127 | if remove_singleton: 128 | if v > 1: 129 | # print k, v 130 | vocab[k] = i + shift 131 | i += 1 132 | else: 133 | vocab[k] = i + shift 134 | i += 1 135 | print("Singleton words number: %d" % len(self.singleton_words)) 136 | return vocab 137 | 138 | def read_files(self, paths): 139 | # word_list = [] 140 | # char_list = [] 141 | # tag_list = [] 142 | word_dict = defaultdict(lambda: 0) 143 | char_set = set() 144 | tag_set = set() 145 | 146 | def _read_a_file(path): 147 | with codecs.open(path, "r", "utf-8") as fin: 148 | to_read_line = [] 149 | for line in fin: 150 | if line.strip() == "": 151 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 152 | to_read_line = [] 153 | else: 154 | to_read_line.append(line.strip()) 155 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 156 | 157 | for path in paths: 158 | _read_a_file(path) 159 | 160 | tag_vocab = self.get_vocab_from_set(tag_set) 161 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 162 | char_vocab = self.get_vocab_from_set(char_set, 1) 163 | 164 | return tag_vocab, word_vocab, char_vocab 165 | 166 | def get_data_set(self, path, lang, source="train"): 167 | sents = [] 168 | char_sents = [] 169 | tgt_tags = [] 170 | discrete_features = [] 171 | bc_features = [] 172 | known_tags = [] 173 | 174 | if source == "train": 175 | sent_types = self.train_senttypes 176 | else: 177 | sent_types = self.dev_senttypes 178 | 179 | def add_sent(one_sent, type): 180 | temp_sent = [] 181 | temp_ner = [] 182 | temp_char = [] 183 | temp_bc = [] 184 | sent = [] 185 | temp_known_tag = [] 186 | for w in one_sent: 187 | fields = w.split() 188 | if len(fields)!=2: 189 | fields = w.split("\t") 190 | assert len(fields)==2 191 | word = fields[0] 192 | sent.append(word) 193 | ner_tag = fields[-1] 194 | if self.use_brown_cluster: 195 | temp_bc.append(self.brown_cluster_dicts[word] if word in self.brown_cluster_dicts else self.brown_cluster_dicts[""]) 196 | 197 | if 
self.args.fixedVocab: 198 | if word in self.word_to_id: 199 | temp_sent.append(self.word_to_id[word]) 200 | elif word.lower() in self.word_to_id: 201 | temp_sent.append(self.word_to_id[word.lower()]) 202 | else: 203 | temp_sent.append(self.word_to_id[""]) 204 | else: 205 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 206 | 207 | if "B-UNK" in ner_tag: 208 | temp_ner.append(self.B_UNK) 209 | elif "I-UNK" in ner_tag: 210 | temp_ner.append(self.I_UNK) 211 | else: 212 | if self.args.misc: 213 | ner_tag = ner_tag.replace("GPE","MISC") 214 | temp_ner.append(self.tag_to_id[ner_tag]) 215 | 216 | if "UNK" in ner_tag: 217 | temp_known_tag.append([0]) 218 | else: 219 | temp_known_tag.append([1]) 220 | 221 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 222 | 223 | sents.append(temp_sent) 224 | char_sents.append(temp_char) 225 | tgt_tags.append(temp_ner) 226 | bc_features.append(temp_bc) 227 | known_tags.append(temp_known_tag) 228 | discrete_features.append([]) 229 | 230 | # print len(discrete_features[-1]) 231 | 232 | with codecs.open(path, "r", "utf-8") as fin: 233 | i = 0 234 | one_sent = [] 235 | for line in fin: 236 | if line.strip() == "" or line.strip() == "\n": 237 | if len(one_sent) > 0: 238 | add_sent(one_sent, sent_types[i] if sent_types is not None else None) 239 | i += 1 240 | if i % 1000 == 0: 241 | print("Processed %d training data." % (i,)) 242 | one_sent = [] 243 | else: 244 | one_sent.append(line.strip()) 245 | 246 | if len(one_sent) > 0: 247 | add_sent(one_sent, sent_types[i] if sent_types is not None else None) 248 | i += 1 249 | 250 | if sent_types is not None: 251 | assert i == len(sent_types), "Not match between number of sentences and sentence types!" 252 | 253 | if self.use_discrete_feature: 254 | self.num_feats = len(discrete_features[0][0]) 255 | else: 256 | self.num_feats = 0 257 | return sents, char_sents, tgt_tags, discrete_features, bc_features, known_tags 258 | -------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | import dynet as dy 3 | import numpy as np 4 | from collections import defaultdict 5 | import gzip 6 | import cPickle as pkl 7 | import codecs 8 | import math 9 | import random 10 | from random import shuffle 11 | 12 | random.seed(448) 13 | np.random.seed(1) 14 | import operator 15 | import re 16 | 17 | MAX_CHAR_LENGTH = 45 18 | 19 | # Regular expressions used to normalize digits. 20 | DIGIT_RE = re.compile(br"\d") 21 | 22 | 23 | # word = utils.DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0] 24 | 25 | 26 | def iob2(tags): 27 | """ 28 | Check that tags have a valid IOB format. 29 | Tags in IOB1 format are converted to IOB2. 
30 | """ 31 | for i, tag in enumerate(tags): 32 | if tag == 'O': 33 | continue 34 | split = tag.split('-') 35 | if len(split) != 2 or split[0] not in ['I', 'B']: 36 | return False 37 | if split[0] == 'B': 38 | continue 39 | elif i == 0 or tags[i - 1] == 'O': # conversion IOB1 to IOB2 40 | tags[i] = 'B' + tag[1:] 41 | elif tags[i - 1][1:] == tag[1:]: 42 | continue 43 | else: # conversion IOB1 to IOB2 44 | tags[i] = 'B' + tag[1:] 45 | return True 46 | 47 | 48 | def get_entity(label): 49 | entities = [] 50 | i = 0 51 | while i < len(label): 52 | if label[i] != 'O': 53 | e_type = label[i][2:] 54 | j = i + 1 55 | while j < len(label) and label[j] == 'I-' + e_type: 56 | j += 1 57 | entities.append((i, j, e_type)) 58 | i = j 59 | else: 60 | i += 1 61 | return entities 62 | 63 | 64 | def evaluate_ner(pred, gold): 65 | tp = 0 66 | fp = 0 67 | fn = 0 68 | for i in range(len(pred)): 69 | pred_entities = get_entity(pred[i]) 70 | gold_entities = get_entity(gold[i]) 71 | temp = 0 72 | for entity in pred_entities: 73 | if entity in gold_entities: 74 | tp += 1 75 | temp += 1 76 | else: 77 | fp += 1 78 | fn += len(gold_entities) - temp 79 | precision = 1.0 * tp / (tp + fp) 80 | recall = 1.0 * tp / (tp + fn) 81 | f1 = 2 * precision * recall / (precision + recall) 82 | return precision, recall, f1 83 | 84 | 85 | def fopen(filename, mode='r'): 86 | if filename.endswith('.gz'): 87 | return gzip.open(filename, mode) 88 | return open(filename, mode) 89 | 90 | 91 | def get_pretrained_emb(fixedVocab, path_to_emb, word_to_id, char_to_id, dim): 92 | word_emb = [] 93 | print("Loading pretrained embeddings from %s." % (path_to_emb)) 94 | print("length of dict: %d" % len(word_to_id)) 95 | 96 | pretrain_word_emb = {} 97 | pretrain_vocab = [] 98 | for line in codecs.open(path_to_emb, "r", "utf-8", errors='replace'): 99 | items = line.strip().split() 100 | if len(items) == dim + 1: 101 | try: 102 | pretrain_word_emb[items[0]] = np.asarray(items[1:]).astype(np.float32) 103 | pretrain_vocab.append(items[0]) 104 | except ValueError: 105 | continue 106 | 107 | for _ in range(len(word_to_id)): 108 | word_emb.append(np.random.uniform(-math.sqrt(3.0 / dim), math.sqrt(3.0 / dim), size=dim)) 109 | 110 | not_covered = 0 111 | print(len(word_to_id), len(word_emb)) 112 | 113 | for word, id in word_to_id.iteritems(): 114 | if word in pretrain_word_emb: 115 | word_emb[id] = pretrain_word_emb[word] 116 | elif word.lower() in pretrain_word_emb: 117 | word_emb[id] = pretrain_word_emb[word.lower()] 118 | else: 119 | not_covered += 1 120 | 121 | if fixedVocab: 122 | #Take top 100000 from the word embeddings 123 | num = 0 124 | for word in pretrain_vocab: 125 | if num > 400000: 126 | break 127 | if word not in word_to_id: 128 | word_to_id[word] = len(word_to_id) 129 | word_emb.append(pretrain_word_emb[word]) 130 | num +=1 131 | 132 | else: 133 | for word in pretrain_word_emb.keys(): 134 | if word not in word_to_id: 135 | word_to_id[word] = len(word_to_id) 136 | word_emb.append(pretrain_word_emb[word]) 137 | 138 | emb = np.array(word_emb, dtype=np.float32) 139 | 140 | print("Word number not covered in pretrain embedding: %d" % not_covered) 141 | return emb, word_to_id, char_to_id 142 | 143 | 144 | def pkl_dump(obj, path): 145 | with open(path, "wb") as fout: 146 | pkl.dump(obj, fout) 147 | 148 | 149 | def pkl_load(path): 150 | with open(path, "rb") as fin: 151 | obj = pkl.load(fin) 152 | return obj 153 | 154 | 155 | def log_sum_exp_dim_0(x): 156 | # numerically stable log_sum_exp 157 | dims = x.dim() 158 | max_score = dy.max_dim(x, 0) # 
(dim_1, batch_size) 159 | if len(dims[0]) == 1: 160 | max_score_extend = max_score 161 | else: 162 | max_score_reshape = dy.reshape(max_score, (1, dims[0][1]), batch_size=dims[1]) 163 | max_score_extend = dy.concatenate([max_score_reshape] * dims[0][0]) 164 | x = x - max_score_extend 165 | exp_x = dy.exp(x) 166 | # (dim_1, batch_size), if no dim_1, return ((1,), batch_size) 167 | log_sum_exp_x = dy.log(dy.mean_dim(exp_x, d=[0], b=False) * dims[0][0]) 168 | return log_sum_exp_x + max_score 169 | 170 | 171 | def data_iterator(data_pair, batch_size): 172 | batches = make_bucket_batches(data_pair, batch_size) 173 | for batch in batches: 174 | yield batch 175 | 176 | 177 | def make_bucket_batches(data_collections, batch_size): 178 | # Data are bucketed according to the length of the first item in the data_collections. 179 | buckets = defaultdict(list) 180 | tot_items = len(data_collections[0]) 181 | for data_item in data_collections: 182 | src = data_item[0] 183 | buckets[len(src)].append(data_item) 184 | 185 | batches = [] 186 | # np.random.seed(2) 187 | for src_len in buckets: 188 | bucket = buckets[src_len] 189 | np.random.shuffle(bucket) 190 | 191 | num_batches = int(np.ceil(len(bucket) * 1.0 / batch_size)) 192 | for i in range(num_batches): 193 | cur_batch_size = batch_size if i < num_batches - 1 else len(bucket) - batch_size * i 194 | batches.append([[bucket[i * batch_size + j][k] for j in range(cur_batch_size)] for k in range(tot_items)]) 195 | np.random.shuffle(batches) 196 | return batches 197 | 198 | 199 | def transpose_input(seq, padding_token=0): 200 | # input seq: list of samples [[w1, w2, ..], [w1, w2, ..]] 201 | max_len = max([len(sent) for sent in seq]) 202 | seq_pad = [] 203 | seq_mask = [] 204 | for i in range(max_len): 205 | pad_temp = [sent[i] if i < len(sent) else padding_token for sent in seq] 206 | mask_temp = [1.0 if i < len(sent) else 0.0 for sent in seq] 207 | seq_pad.append(pad_temp) 208 | seq_mask.append(mask_temp) 209 | 210 | return seq_pad, seq_mask 211 | 212 | 213 | def transpose_discrete_features(feature_batch): 214 | # Discrete features are zero-one features 215 | # TODO: Other integer features, create lookup tables 216 | # tgt_batch: [[[feature of word 1 of sent 1], [feature of word 2 of sent 2], ]] 217 | # return: [(feature_num, batchsize)] 218 | max_sent_len = max([len(s) for s in feature_batch]) 219 | feature_num = len(feature_batch[0][0]) 220 | batch_size = len(feature_batch) 221 | features = [] # each: (feature_num, batch_size) 222 | for i in range(max_sent_len): 223 | w_i_feature = [dy.inputTensor(sent[i], batched=True) if i < len(sent) else dy.zeros(feature_num) for sent in feature_batch] 224 | w_i_feature = dy.reshape(dy.concatenate(w_i_feature, d=1), (feature_num,), batch_size=batch_size) 225 | features.append(w_i_feature) 226 | 227 | return features 228 | 229 | 230 | def transpose_and_batch_embs(input_embs, emb_size): 231 | # input_embs: [[w1_emb, w2_emb, ]], embs are dy.expressions 232 | max_len = max(len(sent) for sent in input_embs) 233 | batch_size = len(input_embs) 234 | padded_seq_emb = [] 235 | seq_masks = [] 236 | for i in range(max_len): 237 | w_i_emb = [sent[i] if i < len(sent) else dy.zeros(emb_size) for sent in input_embs] 238 | w_i_emb = dy.reshape(dy.concatenate(w_i_emb, d=1), (emb_size,), batch_size=batch_size) 239 | w_i_mask = [1.0 if i < len(sent) else 0.0 for sent in input_embs] 240 | padded_seq_emb.append(w_i_emb) 241 | seq_masks.append(w_i_mask) 242 | 243 | return padded_seq_emb, seq_masks 244 | 245 | 246 | def 
transpose_char_input(tgt_batch, padding_token): 247 | # The tgt_batch may not be padded with and 248 | # tgt_batch: [[[, , ], [, s,h,e, ], 249 | # [, i,s, ], [, p,r,e,t,t,y, ], [, , ]], [[],[],[]]] 250 | max_sent_len = max([len(s) for s in tgt_batch]) 251 | sent_w_batch = [] # each is list of list: max_word_len, batch_size 252 | sent_mask_batch = [] # each is list of list: max_word_len, batch_size 253 | max_w_lens = [] 254 | SOW_PAD = 0 255 | EOW_PAD = 1 256 | EOS_PAD = 2 257 | for i in range(max_sent_len): 258 | max_len_w = max([len(sent[i]) for sent in tgt_batch if i < len(sent)]) 259 | max_w_lens.append(max_len_w) 260 | w_batch = [] 261 | mask_batch = [] 262 | for j in range(0, max_len_w): 263 | temp_j_w = [] 264 | for sent in tgt_batch: 265 | if i < len(sent) and j < len(sent[i]): 266 | temp_j_w.append(sent[i][j]) 267 | elif i >= len(sent): 268 | if j == 0: 269 | temp_j_w.append(SOW_PAD) 270 | elif j == max_len_w - 1: 271 | temp_j_w.append(EOW_PAD) 272 | else: 273 | temp_j_w.append(EOS_PAD) 274 | else: 275 | temp_j_w.append(EOW_PAD) 276 | # w_batch = [sent[i][j] if i < len(sent) and j < len(sent[i]) else self.EOW for sent in tgt_batch] 277 | # print "temp: ", temp_j_w 278 | w_batch.append(temp_j_w) 279 | mask_batch.append([1. if i < len(sent) and j < len(sent[i]) else 0.0 for sent in tgt_batch]) 280 | sent_w_batch.append(w_batch) 281 | sent_mask_batch.append(mask_batch) 282 | return sent_w_batch, sent_mask_batch, max_sent_len, max_w_lens 283 | 284 | def get_vocab_from_set(a_set, shift=0): 285 | vocab = {} 286 | for i, elem in enumerate(a_set): 287 | vocab[elem] = i + shift 288 | 289 | return vocab 290 | 291 | if __name__ == "__main__": 292 | # from scipy.misc import logsumexp 293 | # import numpy as np 294 | # 295 | # # a = np.random.rand(3, 4, 2) 296 | # # b = logsumexp(a, axis=0) 297 | # # a_t = dy.inputTensor(a, batched=True) 298 | # # b_t = log_sum_exp_dim_0(a_t) 299 | # # print "numpy " 300 | # # print b 301 | # # print "dynet " 302 | # # print b_t.value(), b_t.dim() 303 | # # print dy.pick_batch_elem(b_t, 1).npvalue() 304 | # 305 | # a = np.random.rand(3, 2) 306 | # b = logsumexp(a, axis=0) 307 | # a_t = dy.inputTensor(a, batched=True) 308 | # b_t = log_sum_exp_dim_0(a_t) 309 | # print "numpy " 310 | # print b 311 | # print "dynet " 312 | # print b_t.value(), b_t.dim() 313 | # print dy.pick_batch_elem(b_t, 1).npvalue() 314 | dim = 100 315 | # 9 1000 316 | 317 | path_to_emb = "../datasets/english/glove.6B/glove.6B.100d.txt" 318 | pretrain_word_emb = {} 319 | i = 1 320 | for line in codecs.open(path_to_emb, "r", 'utf-8', errors='replace'): 321 | items = line.strip().split() 322 | if len(items) == dim + 1: 323 | try: 324 | pretrain_word_emb[items[0]] = np.asarray(items[1:]).astype(np.float32) 325 | except ValueError: 326 | continue 327 | print items[0], i, pretrain_word_emb[items[0]][:3] 328 | i += 1 329 | 330 | # gradient clipping 331 | # turn off the dropout 332 | # use smaller initial lr 333 | # variational dropout 334 | -------------------------------------------------------------------------------- /models/encoders.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | from utils.util import * 3 | 4 | ''' Designing idea: the encoder should be agnostic to the input, it can be either 5 | arbitrary spans, characters, or words, or even raw feature. However, user has to specify 6 | whether to have the lookup table for any input. 
7 | 8 | There are also two ways to feed in multiple input features: 9 | (a) First concatenate all features for each position, and then use them as features for one encoder, e.g. bilstm 10 | (b) Use multiple encoders for multiple features then combine outputs from multiple encoders, either concat them 11 | or feed them to another encoder.''' 12 | 13 | 14 | class Encoder(): 15 | def __init__(self): 16 | pass 17 | 18 | def encode(self): 19 | raise NotImplementedError 20 | 21 | # class concat_input_encoder(encoder): 22 | # def __init__(self, model, lookups, lookup_table_dims): 23 | # # length of elements in lookup_table_dims == number of elements in lookups which are true 24 | # self.num_inputs = len(lookups) 25 | # self.lookups = lookups 26 | # self.lookup_params = [] 27 | # for i, lookup in enumerate(lookups): 28 | # if lookup == 1: 29 | # # add loop up parameters 30 | # self.lookup_params.append(model.add_lookup_parameters((lookup_table_dims[i][0], lookup_table_dims[i][1]))) 31 | # elif lookup == 2: 32 | # # add normal transformation parameters 33 | # # dims: discrete_feature_num, continuous_emb_dim 34 | # # the input should concatenate all the discrete features together first 35 | # self.lookup_params.append(model.add_parameters((lookup_table_dims[i][0], lookup_table_dims[i][1]))) 36 | # else: 37 | # self.lookup_params.append(0) 38 | # 39 | # def prepare_inputs(self, inputs): 40 | # # inputs: (a) 41 | # input_features = [] 42 | # for i, lookup in enumerate(self.lookups): 43 | # if lookup == 1: 44 | 45 | 46 | class Lookup_Encoder(Encoder): 47 | def __init__(self, model, args, vocab_size, emb_size, padding_token=None, pretrain_embedding=None, isFeatureEmb=False): 48 | Encoder.__init__(self) 49 | self.padding_token = padding_token 50 | self.map_pretrain = args.map_pretrain 51 | self.pretrain_fix = args.pretrain_fix 52 | self.isFeatureEmb = isFeatureEmb 53 | if args.map_pretrain: 54 | self.W_map = model.add_parameters((args.map_dim, emb_size)) 55 | self.b_map = model.add_parameters(args.map_dim) 56 | self.b_map.zero() 57 | if pretrain_embedding is not None: 58 | self.lookup_table = model.lookup_parameters_from_numpy(pretrain_embedding) 59 | else: 60 | self.lookup_table = model.add_lookup_parameters((vocab_size, emb_size)) 61 | 62 | def encode(self, input_seqs): 63 | transpose_inputs, _ = transpose_input(input_seqs, self.padding_token) 64 | embs = [dy.lookup_batch(self.lookup_table, wids) for wids in transpose_inputs] 65 | if self.pretrain_fix and not self.isFeatureEmb: 66 | embs = [dy.nobackprop(emb) for emb in embs] 67 | # TODO: initialize with ones vector, initialize W_map with identity matrix 68 | if self.map_pretrain and not self.isFeatureEmb: 69 | if not self.pretrain_fix: 70 | embs = [dy.nobackprop(emb) for emb in embs] 71 | W_map = dy.parameter(self.W_map) 72 | b_map = dy.parameter(self.b_map) 73 | embs = [dy.affine_transform([b_map, W_map, emb]) for emb in embs] 74 | return embs 75 | 76 | 77 | class Discrete_Feature_Encoder(Encoder): 78 | def __init__(self, model, num_feats, to_dim): 79 | Encoder.__init__(self) 80 | self.num_feats = num_feats 81 | self.to_dim = to_dim 82 | self.W_feat_emb = model.add_parameters((to_dim, num_feats)) 83 | 84 | def encode(self, input_feats): 85 | batch_size = len(input_feats) 86 | # after transpose: input_feats: [(num_feats, batch_size)] 87 | input_feats = transpose_discrete_features(input_feats) 88 | W_feat_emb = dy.parameter(self.W_feat_emb) 89 | output_emb = [] 90 | for wif in input_feats: 91 | extend_wif = dy.transpose(dy.concatenate_cols([wif for _ 
in range(self.to_dim)])) 92 | feature_emb = dy.cmult(extend_wif, W_feat_emb) 93 | output_emb.append(dy.reshape(feature_emb, (self.to_dim * self.num_feats, ), batch_size=batch_size)) 94 | return output_emb 95 | 96 | 97 | class CNN_Encoder(Encoder): 98 | def __init__(self, model, emb_size, win_size=3, filter_size=64, dropout=0.5, vocab_size=0, padding_token=0, lookup_emb=None): 99 | Encoder.__init__(self) 100 | self.vocab_size = vocab_size # if 0, no lookup tables 101 | self.win_size = win_size 102 | self.filter_size = filter_size 103 | self.emb_size = emb_size 104 | self.dropout_rate = dropout 105 | self.paddding_token = padding_token 106 | if vocab_size != 0: 107 | print("In CNN encoder: creating lookup embedding!") 108 | self.lookup_emb = model.add_lookup_parameters((vocab_size, 1, 1, emb_size)) 109 | else: 110 | assert lookup_emb is not None 111 | print("In CNN encoder: reusing lookup embedding!") 112 | self.lookup_emb = lookup_emb 113 | 114 | self.W_cnn = model.add_parameters((1, win_size, emb_size, filter_size)) 115 | self.b_cnn = model.add_parameters((filter_size)) 116 | self.b_cnn.zero() 117 | 118 | def _cnn_emb(self, input_embs, training): 119 | # input_embs: (h, time_step, dim, batch_size), h=1 120 | if self.dropout_rate > 0 and training: 121 | input_embs = dy.dropout(input_embs, self.dropout_rate) 122 | W_cnn = dy.parameter(self.W_cnn) 123 | b_cnn = dy.parameter(self.b_cnn) 124 | 125 | cnn_encs = dy.conv2d_bias(input_embs, W_cnn, b_cnn, stride=(1, 1), is_valid=False) 126 | tanh_cnn_encs = dy.tanh(cnn_encs) 127 | max_pool_out = dy.reshape(dy.max_dim(tanh_cnn_encs, d=1), (self.filter_size,)) 128 | # rec_pool_out = dy.rectify(max_pool_out) 129 | return max_pool_out 130 | 131 | def encode(self, input_seqs, training=True, char=True): 132 | batch_size = len(input_seqs) 133 | sents_embs = [] 134 | if char: 135 | # we don't batch at first, we batch after cnn 136 | for sent in input_seqs: 137 | sent_emb = [] 138 | for w in sent: 139 | if len(w) < self.win_size: 140 | w += [self.paddding_token] * (self.win_size - len(w)) 141 | input_embs = dy.concatenate([dy.lookup(self.lookup_emb, c) for c in w], d=1) 142 | w_emb = self._cnn_emb(input_embs, training) # (filter_size, 1) 143 | sent_emb.append(w_emb) 144 | sents_embs.append(sent_emb) 145 | sents_embs, sents_mask = transpose_and_batch_embs(sents_embs, self.filter_size) # [(filter_size, batch_size)] 146 | else: 147 | for sent in input_seqs: 148 | if self.vocab_size != 0: 149 | if len(sent) < self.win_size: 150 | sent += [0] * (self.win_size - len(sent)) 151 | input_embs = dy.concatenate([dy.lookup(self.lookup_emb, w) for w in sent], d=1) 152 | else: 153 | # input_seqs: [(emb_size, batch_size)] 154 | if len(sent) < self.win_size: 155 | sent += [dy.zeros(self.emb_size)] * (self.win_size - len(sent)) 156 | input_embs = dy.transpose(dy.concatenate_cols(sent)) # (time_step, emb_size, bs) 157 | input_embs = dy.reshape(input_embs, (1, len(sent), self.emb_size), ) 158 | 159 | sent_emb = self._cnn_emb(input_embs, training) # (filter_size, 1) 160 | sents_embs.append(sent_emb) 161 | sents_embs = dy.reshape(dy.concatenate(sents_embs, d=1), (self.filter_size,), batch_size =batch_size) # (filter_size, batch_size) 162 | 163 | return sents_embs 164 | 165 | 166 | class BiRNN_Encoder(Encoder): 167 | def __init__(self, 168 | model, 169 | input_dim, 170 | hidden_dim, 171 | emb_dropout_rate=0.3, 172 | output_dropout_rate=0.5, 173 | padding_token=None, 174 | vocab_size=0, 175 | emb_size=0, 176 | layer=1, 177 | rnn="lstm", 178 | vocab_emb=None): 179 | 
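        # Three usage modes (summarising the branches below): vocab_size > 0 creates a fresh
        # (vocab_size, emb_size) lookup table; vocab_size == 0 with vocab_emb given reuses an
        # existing lookup (e.g. the CNN char lookup); with neither, encode_seq expects
        # pre-computed embedding expressions as input.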
Encoder.__init__(self) 180 | # self.birnn = dy.BiRNNBuilder(layer, input_dim, hidden_dim, model, dy.LSTMBuilder if rnn == "lstm" else dy.GRUBuilder) 181 | self.fwd_RNN = dy.LSTMBuilder(layer, input_dim, hidden_dim, model) if rnn == "lstm" else dy.GRUBuilder(layer, input_dim, hidden_dim, model) 182 | self.bwd_RNN = dy.LSTMBuilder(layer, input_dim, hidden_dim, model) if rnn == "lstm" else dy.GRUBuilder(layer, input_dim, hidden_dim, model) 183 | 184 | self.input_dim = input_dim 185 | self.vocab_size = vocab_size 186 | self.padding_token = padding_token 187 | self.drop_out_rate = output_dropout_rate 188 | self.emb_drop_rate = emb_dropout_rate 189 | self.hidden_dim = hidden_dim 190 | if vocab_size > 0: 191 | print("In BiRNN, creating lookup table!") 192 | self.vocab_emb = model.add_lookup_parameters((vocab_size, emb_size)) 193 | else: 194 | if vocab_emb is not None: 195 | # assert vocab_emb is not None 196 | self.vocab_emb = vocab_emb 197 | else: 198 | self.vocab_emb = None 199 | 200 | def encode(self, input_seqs, training=True, char=False): 201 | if char: 202 | return self.encode_word(input_seqs, training=training) 203 | else: 204 | return self.encode_seq(input_seqs, training=training) 205 | 206 | def encode_seq(self, input_seqs, training=True, char=False): 207 | if self.vocab_emb is not None: 208 | # input_seqs = [[w1, w2],[]] 209 | transpose_inputs, _ = transpose_input(input_seqs, self.padding_token) 210 | if self.vocab_size != 0: 211 | w_embs = [dy.dropout(dy.lookup_batch(self.vocab_emb, wids), 212 | self.emb_drop_rate) if self.emb_drop_rate > 0. and training 213 | else dy.lookup_batch(self.vocab_emb, wids) 214 | for wids in transpose_inputs] 215 | else: 216 | # print "In our case, use parameters shared by CNN char encoder, need conversion!" 217 | vocab_emb = dy.parameter(self.vocab_emb) 218 | vocab_size = vocab_emb.dim()[0][-1] 219 | # print "In BiRNN Char vocab size: ", vocab_size 220 | vocab_emb = dy.reshape(vocab_emb, (self.input_dim, vocab_size)) # expression, not lookup_parameters 221 | 222 | # for wids in transpose_inputs: 223 | # print wids 224 | # print vocab_emb.dim() 225 | # a = dy.pick_batch(vocab_emb, wids, dim=1) 226 | # print a.value() 227 | # Special case handler: use pick_batch 228 | w_embs = [dy.dropout(dy.pick_batch(vocab_emb, wids, dim=1), 229 | self.emb_drop_rate) if self.emb_drop_rate > 0. and training 230 | else dy.pick_batch(vocab_emb, wids, dim=1) 231 | for wids in transpose_inputs] 232 | # print "In BiRNN char: ", w_embs[0].dim() 233 | else: 234 | w_embs = [dy.dropout(emb, self.emb_drop_rate) if self.emb_drop_rate > 0. and training else emb for emb in input_seqs] 235 | # if vocab_size = 0: input_seqs = [(input_dim, batch_size)] 236 | 237 | w_embs_r = w_embs[::-1] 238 | # birnn_outputs = [dy.dropout(emb, self.drop_out_rate) if self.drop_out_rate > 0. 
else emb for emb in self.birnn.transduce(w_embs)] 239 | fwd_vectors = self.fwd_RNN.initial_state().transduce(w_embs) 240 | bwd_vectors = self.bwd_RNN.initial_state().transduce(w_embs_r)[::-1] 241 | 242 | if char: 243 | return dy.concatenate([fwd_vectors[-1], bwd_vectors[0]]) 244 | 245 | birnn_outputs = [dy.dropout(dy.concatenate([fwd_v, bwd_v]), self.drop_out_rate) if self.drop_out_rate > 0.0 and training 246 | else dy.concatenate([fwd_v, bwd_v]) 247 | for (fwd_v, bwd_v) in zip(fwd_vectors, bwd_vectors)] 248 | return birnn_outputs 249 | 250 | def encode_word(self, input_seqs, training=True): 251 | # embedding dropout rate is 0.0, because we dropout at the later stage of RNN 252 | sents_embs = [] 253 | 254 | for sent in input_seqs: 255 | sent_emb = [] 256 | for w in sent: 257 | w_emb = self.encode_seq([w], training=training, char=True) 258 | sent_emb.append(w_emb) 259 | sents_embs.append(sent_emb) 260 | sents_embs, sents_mask = transpose_and_batch_embs(sents_embs, self.hidden_dim*2) # [(hidden_dim*2, batch_size)] 261 | return sents_embs -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou and aditichaudhary' 2 | import sys 3 | reload(sys) 4 | sys.setdefaultencoding('utf-8') 5 | 6 | def evaluate(data_loader, path, model, model_name,type="dev"): 7 | sents, char_sents, tgt_tags, discrete_features, bc_feats,_ = data_loader.get_data_set(path, args.lang, source="dev") 8 | 9 | prefix = model_name + "_" + str(uid) 10 | # tot_acc = 0.0 11 | predictions = [] 12 | gold_standards = [] 13 | sentences = [] 14 | i = 0 15 | sentence_gold = {} 16 | 17 | score_sent = {} 18 | for sent, char_sent, tgt_tag, discrete_feature, bc_feat in zip(sents, char_sents, tgt_tags, discrete_features, bc_feats): 19 | dy.renew_cg() 20 | sent, char_sent, discrete_feature, bc_feat = [sent], [char_sent], [discrete_feature], [bc_feat] 21 | best_score, best_path = model.eval(sent, char_sent, discrete_feature, bc_feat, training=False,type=type) 22 | 23 | assert len(best_path) == len(tgt_tag) 24 | # acc = model.crf_decoder.cal_accuracy(best_path, tgt_tag) 25 | # tot_acc += acc 26 | predictions.append(best_path) 27 | gold_standards.append(tgt_tag) 28 | 29 | sentences.append(sent) 30 | sent_key = " ".join([str(x) for x in sent[0]]) 31 | sentence_gold[sent_key] = tgt_tag 32 | score_sent[sent_key] = best_score 33 | 34 | i += 1 35 | if i % 1000 == 0: 36 | print("Testing processed %d lines " % i) 37 | 38 | pred_output_fname = "%s/%s_pred_output.txt" % (args.eval_folder,prefix) 39 | eval_output_fname = "%s_eval_score.txt" % (prefix) 40 | with open(pred_output_fname, "w") as fout: 41 | for sent, pred, gold in zip(sentences, predictions, gold_standards): 42 | for s, p, g in zip(sent[0], pred, gold): 43 | fout.write(data_loader.id_to_word[int(s)] + " " + data_loader.id_to_tag[g] + " " + data_loader.id_to_tag[p] + "\n") 44 | fout.write("\n") 45 | 46 | os.system("%s/conlleval.v2 < %s > %s" % (args.eval_folder,pred_output_fname, eval_output_fname)) 47 | 48 | with open(eval_output_fname, "r") as fin: 49 | lid = 0 50 | for line in fin: 51 | if lid == 1: 52 | fields = line.split(";") 53 | acc = float(fields[0].split(":")[1].strip()[:-1]) 54 | precision = float(fields[1].split(":")[1].strip()[:-1]) 55 | recall = float(fields[2].split(":")[1].strip()[:-1]) 56 | f1 = float(fields[3].split(":")[1].strip()) 57 | lid += 1 58 | 59 | output = open(eval_output_fname, "r").read().strip() 60 | print(output) 61 | 
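    # Note: the parsing above assumes the second line of conlleval output, which looks like
    #   accuracy:  90.12%; precision:  85.34%; recall:  80.56%; FB1:  82.88
    # (the numbers shown here are only an illustration).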
if type == "dev": 62 | os.system("rm %s" % (eval_output_fname,)) 63 | os.system("rm %s" % (pred_output_fname,)) 64 | 65 | 66 | return acc, precision, recall, f1,sentence_gold, score_sent 67 | 68 | 69 | def replace_singletons(data_loader, sents, replace_rate): 70 | new_batch_sents = [] 71 | for sent in sents: 72 | new_sent = [] 73 | for word in sent: 74 | if word in data_loader.singleton_words: 75 | new_sent.append(word if np.random.uniform(0., 1.) > replace_rate else data_loader.word_to_id[""]) 76 | else: 77 | new_sent.append(word) 78 | new_batch_sents.append(new_sent) 79 | return new_batch_sents 80 | 81 | 82 | def main(args): 83 | prefix = args.model_name + "_" + str(uid) 84 | print("PREFIX: %s" % prefix) 85 | final_darpa_output_fname = "%s/%s_output.conll" % (args.eval_folder,prefix) 86 | best_output_fname = "%s/best_%s_output.conll" % (args.eval_folder,prefix) 87 | ner_data_loader = NER_DataLoader(args) 88 | print(ner_data_loader.id_to_tag) 89 | 90 | #Loading training data from CoNLL format 91 | if not args.data_aug: 92 | sents, char_sents, tgt_tags, discrete_features, bc_features,known_tags = ner_data_loader.get_data_set(args.train_path, args.lang) 93 | else: 94 | sents_tgt, char_sents_tgt, tags_tgt, dfs_tgt, bc_feats_tgt,known_tags_tgt = ner_data_loader.get_data_set(args.tgt_lang_train_path, args.lang) 95 | sents_aug, char_sents_aug, tags_aug, dfs_aug, bc_feats_aug, known_tags_aug= ner_data_loader.get_data_set(args.aug_lang_train_path, args.aug_lang) 96 | sents, char_sents, tgt_tags, discrete_features, bc_features,known_tags = sents_tgt+sents_aug, char_sents_tgt+char_sents_aug, tags_tgt+tags_aug, dfs_tgt+dfs_aug, bc_feats_tgt+bc_feats_aug,known_tags_tgt+known_tags_aug 97 | 98 | 99 | print("Data set size (train): %d" % len(sents)) 100 | print("Number of discrete features: ", ner_data_loader.num_feats) 101 | epoch = bad_counter = updates = tot_example = cum_loss = 0 102 | patience = args.patience 103 | 104 | display_freq = 100 105 | valid_freq = args.valid_freq 106 | batch_size = args.batch_size 107 | 108 | 109 | print("Using Char CNN model!") 110 | model = vanilla_NER_CRF_model(args, ner_data_loader) 111 | inital_lr = args.init_lr 112 | 113 | if args.fineTune: 114 | print("Loading pre-trained model!") 115 | model.load() 116 | 117 | if len(sents) < 100: 118 | inital_lr = 0.0001 119 | else: 120 | inital_lr = args.init_lr #+ inital_lr * len(sents) / 1500.0 121 | 122 | 123 | trainer = dy.MomentumSGDTrainer(model.model, inital_lr, 0.9) 124 | 125 | def _check_batch_token(batch, id_to_vocab): 126 | for line in batch: 127 | print([id_to_vocab[i] for i in line]) 128 | 129 | def _check_batch_char(batch, id_to_vocab): 130 | for line in batch: 131 | print([u" ".join([id_to_vocab[c] for c in w]) for w in line]) 132 | 133 | lr_decay = args.decay_rate 134 | 135 | # decay_patience = 3 136 | # decay_num = 0 137 | valid_history = [] 138 | best_results = [0.0, 0.0, 0.0, 0.0] 139 | while epoch <= args.tot_epochs: 140 | batches = make_bucket_batches( 141 | zip(sents, char_sents, tgt_tags, discrete_features, bc_features, known_tags), batch_size) 142 | 143 | for b_sents, b_char_sents, b_ner_tags, b_feats, b_bc_feats, b_known_tags in batches: 144 | dy.renew_cg() 145 | 146 | if args.replace_unk_rate > 0.0: 147 | b_sents = replace_singletons(ner_data_loader, b_sents, args.replace_unk_rate) 148 | # _check_batch_token(b_sents, ner_data_loader.id_to_word) 149 | # _check_batch_token(b_ner_tags, ner_data_loader.id_to_tag) 150 | # _check_batch_char(b_char_sents, ner_data_loader.id_to_char) 151 | 152 | loss = 
model.cal_loss(b_sents, b_char_sents, b_ner_tags, b_feats, b_bc_feats, b_known_tags, training=True) 153 | loss_val = loss.value() 154 | cum_loss += loss_val * len(b_sents) 155 | tot_example += len(b_sents) 156 | 157 | updates += 1 158 | loss.backward() 159 | trainer.update() 160 | 161 | if updates % display_freq == 0: 162 | print("Epoch = %d, Updates = %d, CRF Loss=%f, Accumulative Loss=%f." % (epoch, updates, loss_val, cum_loss*1.0/tot_example)) 163 | if updates % valid_freq == 0: 164 | acc, precision, recall, f1,_,_ = evaluate(ner_data_loader, args.dev_path, model, args.model_name) 165 | 166 | if len(valid_history) == 0 or f1 > max(valid_history): 167 | bad_counter = 0 168 | best_results = [acc, precision, recall, f1] 169 | if updates > 0: 170 | print("Saving the best model so far.......") 171 | model.save() 172 | else: 173 | bad_counter += 1 174 | if args.lr_decay and bad_counter >= 3 and os.path.exists(args.save_to_path): 175 | bad_counter = 0 176 | model.load() 177 | lr = inital_lr / (1 + epoch * lr_decay) 178 | print("Epoch = %d, Learning Rate = %f." % (epoch, lr)) 179 | trainer = dy.MomentumSGDTrainer(model.model, lr) 180 | 181 | if bad_counter > patience: 182 | print("Early stop!") 183 | print("Best on validation: acc=%f, prec=%f, recall=%f, f1=%f" % tuple(best_results)) 184 | 185 | acc, precision, recall, f1,sentence_gold, score_sent = evaluate(ner_data_loader, args.test_path, model, args.model_name,"test") 186 | if args.SPAN_wise: 187 | createAnnotationOutput_SPAN_wise(args, model, ner_data_loader, sentence_gold, score_sent) 188 | 189 | exit(0) 190 | valid_history.append(f1) 191 | epoch += 1 192 | 193 | 194 | 195 | _,_,_,_,sentence_gold, score_sent = evaluate(ner_data_loader, args.test_path, model, args.model_name,"test") 196 | if args.SPAN_wise: 197 | createAnnotationOutput_SPAN_wise(args, model, ner_data_loader, sentence_gold, score_sent) 198 | print("All Epochs done.") 199 | 200 | def createAnnotationOutput_SPAN_wise(args, model, data_loader, sentence_gold, score_sent): 201 | # normalize all the entropy_spans ONLY DONE for the CFB 202 | 203 | 204 | reverse = True #For ETAL we look at the highest entropy ones, hence sorting is reversed 205 | if args.use_CFB: #For CFEAL we look at the least confident, hence sorting is not reversed 206 | reverse = False 207 | 208 | 209 | # Order the sentences by entropy of the spans 210 | fout= codecs.open(args.to_annotate, "w", encoding='utf-8') 211 | 212 | sorted_spans = sorted(model.crf_decoder.most_uncertain_entropy_spans, key=lambda k:model.crf_decoder.most_uncertain_entropy_spans[k],reverse=reverse) 213 | print("Total unique spans: {0}".format(len(sorted_spans))) 214 | count_span = args.k 215 | count_tokens = args.k 216 | 217 | #DEBUG Print Span Entropy in the sorted order of selected spans 218 | fdebug = codecs.open("./" + args.model_name + "_span_entropy_debug.txt", "w", encoding='utf-8') 219 | 220 | for sorted_span in sorted_spans: 221 | 222 | span_words= [] 223 | if count_tokens <=0: 224 | break 225 | (span_entropy,sentence_key, start, end,best_path) = model.crf_decoder.most_uncertain_entropy_spans[sorted_span] 226 | gold_path = sentence_gold[sentence_key] 227 | sent = sentence_key.split() 228 | 229 | for t in sorted_span.split(): 230 | span_words.append(data_loader.id_to_word[int(t)]) 231 | fdebug.write(" ".join(span_words) + " " + str(span_entropy) + "\n") 232 | 233 | first = True 234 | path = deepcopy(best_path) 235 | for i in range(start, end): 236 | if first: 237 | path[i] = -10 #Id for B-UNK 238 | first = False 239 | else: 240 | 
path[i] = -11 #Id for I-UNK 241 | 242 | idx = 0 243 | for token, tag in zip(sent, path): 244 | 245 | if tag == -10: 246 | tag_label = "B-UNK" 247 | count_span -= 1 248 | count_tokens -= 1 249 | elif tag == -11: 250 | tag_label = "I-UNK" 251 | count_tokens -= 1 252 | else: 253 | tag_label = data_loader.id_to_tag[tag] 254 | 255 | gold_tag_label = data_loader.id_to_tag[gold_path[idx]] 256 | idx += 1 257 | fout.write(data_loader.id_to_word[int(token)] + "\t" + tag_label + "\t" + gold_tag_label + "\n") 258 | 259 | fout.write("\n") 260 | 261 | print("Total unique spans for exercise: {0}".format(args.k)) 262 | 263 | #SAL: Select most uncertain sequence 264 | basename = os.path.basename(args.to_annotate).replace(".conll", "") 265 | LC_output_file = os.path.dirname(args.to_annotate) + "/" + basename + "_LC.conll" 266 | count_tokens = args.k 267 | with codecs.open(LC_output_file, "w", encoding='utf-8') as fout: 268 | idx = 0 269 | for sentence_key in sorted(score_sent.keys(), reverse=False): 270 | if count_tokens<=0: 271 | break 272 | sent = sentence_key.split() 273 | gold_path = sentence_gold[sentence_key] 274 | token_count = 0 275 | for token in sent: 276 | count_tokens -= 1 277 | gold_tag_label = data_loader.id_to_tag[gold_path[token_count]] 278 | token_count += 1 279 | fout.write(data_loader.id_to_word[int(token)] + "\t" + "UNK " + gold_tag_label + "\n") 280 | fout.write("\n") 281 | idx += 1 282 | 283 | 284 | def test_single_model(args): 285 | ner_data_loader = NER_DataLoader(args) 286 | # ugly: get discrete number features 287 | _, _, _, _, _,_ = ner_data_loader.get_data_set(args.train_path, args.lang) 288 | 289 | print("Using Char CNN model!") 290 | model = vanilla_NER_CRF_model(args, ner_data_loader) 291 | model.load() 292 | 293 | _,_,_,_,sentence_gold, score_sent = evaluate(ner_data_loader, args.test_path, model, args.model_name,"test") 294 | if args.SPAN_wise: 295 | createAnnotationOutput_SPAN_wise(args, model, ner_data_loader, sentence_gold, score_sent) 296 | 297 | 298 | 299 | 300 | from args import init_config 301 | 302 | args = init_config() 303 | from models.model_builder import * 304 | import os 305 | import uuid 306 | from dataloaders.data_loader import * 307 | uid = uuid.uuid4().get_hex()[:6] 308 | 309 | if __name__ == "__main__": 310 | # args = init_config() 311 | if args.mode == "train": 312 | if args.load_from_path is not None: 313 | args.load_from_path = args.load_from_path 314 | else: 315 | args.load_from_path = args.save_to_path 316 | main(args) 317 | 318 | elif args.mode == "test_1": 319 | test_single_model(args) 320 | 321 | else: 322 | raise NotImplementedError 323 | -------------------------------------------------------------------------------- /eval/conlleval.v2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). 
The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while () { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { 
$features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if ($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = $features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 
177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkEnd = $true; } 265 | 266 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 267 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 269 | 270 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 271 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 272 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 273 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkEnd = $true; } 274 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkEnd = $true; } 275 | 276 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkEnd = $true; } 277 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "S" and $tag eq "O" ) { $chunkEnd = $true; } 279 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkEnd = $true; } 280 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkEnd = $true; } 281 | 282 | 283 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 284 | $chunkEnd = $true; 285 | } 286 | 287 | # corrected 1998-12-22: these chunks are assumed to have length 1 288 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 289 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 290 | 291 | return($chunkEnd); 292 | } 293 | 294 | # startOfChunk: checks if a chunk started between the previous and current word 295 | # arguments: previous and current chunk tags, previous and current types 296 | # note: this code is capable of handling other chunk representations 297 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 298 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 299 | 300 | sub startOfChunk { 301 | my $prevTag = shift(@_); 302 | my $tag = shift(@_); 303 | my $prevType = shift(@_); 304 | my $type = shift(@_); 305 | my $chunkStart = $false; 306 | 307 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 308 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 309 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 310 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkStart = $true; } 311 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkStart = $true; } 312 | 313 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkStart = $true; } 314 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkStart = $true; } 315 | if ( $prevTag eq "O" and $tag eq "S" ) { $chunkStart = $true; } 316 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkStart = $true; } 317 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkStart = $true; } 318 | 319 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 320 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkStart = $true; } 321 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 322 | 323 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkStart = $true; } 324 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 325 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 326 | 327 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 328 | $chunkStart = $true; 329 | } 330 | 331 | # corrected 1998-12-22: these chunks are assumed to have length 1 332 | if ( $tag eq "[" ) { $chunkStart = $true; } 333 | if ( $tag eq "]" ) { $chunkStart = $true; } 334 | 335 | return($chunkStart); 336 | } 337 | -------------------------------------------------------------------------------- /models/decoders.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou and aditichaudhary' 2 | from utils.util import * 3 | import numpy as np 4 | from collections import defaultdict 5 | from scipy.special import logsumexp 6 | 7 | 8 | class Decoder(): 9 | def __init__(self, tag_size): 10 | # type: () -> object 11 | pass 12 | 13 | def decode_loss(self): 14 | raise NotImplementedError 15 | 16 | def decoding(self): 17 | raise NotImplementedError 18 | 19 | 20 | def constrained_transition_init(transition_matrix, contraints): 21 | ''' 22 | :param transition_matrix: numpy array, (from, to) 23 | :param contraints: [[from_indexes], [to_indexes]] 24 | :return: newly initialized transition matrix 25 | ''' 26 | for cons in contraints: 27 | transition_matrix[cons[0], cons[1]] = -1000.0 28 | return transition_matrix 29 | 30 | class chain_CRF_decoder(Decoder): 31 | ''' For NER and POS Tagging. 
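    Two extra tags are appended internally (start_id = tag_size, end_id = tag_size + 1)
    for the CRF start/end transitions, so self.tag_size is the label-set size plus two.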
''' 32 | 33 | def __init__(self, args, model, src_output_dim, tag_emb_dim, tag_size, constraints=None): 34 | Decoder.__init__(self, tag_size) 35 | self.model = model 36 | self.start_id = tag_size 37 | self.end_id = tag_size + 1 38 | self.tag_size = tag_size + 2 39 | tag_size = tag_size + 2 40 | self.args = args 41 | 42 | # optional: transform the hidden space of src encodings into the tag embedding space 43 | self.W_src2tag_readout = model.add_parameters((tag_emb_dim, src_output_dim)) 44 | self.b_src2tag_readout = model.add_parameters((tag_emb_dim)) 45 | self.b_src2tag_readout.zero() 46 | 47 | self.W_scores_readout2tag = model.add_parameters((tag_size, tag_emb_dim)) 48 | self.b_scores_readout2tag = model.add_parameters((tag_size)) 49 | self.b_scores_readout2tag.zero() 50 | 51 | # (to, from), trans[i] is the transition score to i 52 | init_transition_matrix = np.random.randn(tag_size, tag_size) # from, to 53 | init_transition_matrix[:, self.end_id] = -1000.0 54 | init_transition_matrix[self.start_id, :] = -1000.0 55 | if False and constraints is not None: 56 | init_transition_matrix = constrained_transition_init(init_transition_matrix, constraints) 57 | # print init_transition_matrix 58 | #self.transition_matrix = model.add_lookup_parameters((tag_size, tag_size), 59 | # init=dy.NumpyInitializer(init_transition_matrix)) 60 | self.transition_matrix = model.lookup_parameters_from_numpy(init_transition_matrix) # (to, from) 61 | 62 | self.ngram = args.ngram 63 | 64 | self.entropy_threshold = args.entropy_threshold 65 | if args.entropy_threshold is not None and args.use_CFB: 66 | self.entropy_threshold = args.entropy_threshold * -1 67 | 68 | self.prob_threshold = np.NINF 69 | self.entropy_spans = defaultdict(lambda: 0) 70 | self.most_uncertain_entropy_spans = {} 71 | self.entropy_spans_number = defaultdict(lambda: 0) 72 | self.full_sentences = defaultdict(list) 73 | self.avg_spans_in_sent_entropy = defaultdict(list) 74 | self.SPAN_wise = args.SPAN_wise 75 | 76 | def forward_alg(self, tag_scores): 77 | ''' Forward DP for CRF. 
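        Computes alpha_t(j) = logsumexp_i [ alpha_{t-1}(i) + transition(i -> j) ] + emission_t(j)
        and returns (terminal log-partition score per batch element, list of per-step alphas).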
78 | tag_scores (list of batched dy.Tensor): (tag_size, batchsize) 79 | ''' 80 | # Be aware: if a is lookup_parameter with 2 dimension, then a[i] returns one row; 81 | # if b = dy.parameter(a), then b[i] returns one column; which means dy.parameter(a) already transpose a 82 | # transpose_transition_score = self.transition_matrix 83 | transpose_transition_score = dy.parameter(self.transition_matrix) # (from, to) 84 | 85 | # alpha(t', s) = the score of sequence from t=0 to t=t' in log space 86 | # np_init_alphas = -100.0 * np.ones((self.tag_size, batch_size)) 87 | # np_init_alphas[self.start_id, :] = 0.0 88 | # alpha_tm1 = dy.inputTensor(np_init_alphas, batched=True) 89 | alphas = [] 90 | 91 | alpha_tm1 = transpose_transition_score[self.start_id] + tag_scores[0] 92 | # self.transition_matrix[i]: from i, column 93 | # transpose_score[i]: to i, row 94 | # transpose_score: to, from 95 | alphas.append(alpha_tm1) 96 | 97 | for tag_score in tag_scores[1:]: 98 | # extend for each transit 99 | alpha_tm1 = dy.concatenate_cols([alpha_tm1] * self.tag_size) # (from, to, batch_size) 100 | # each column i of tag_score will be the repeated emission score to tag i 101 | tag_score = dy.transpose(dy.concatenate_cols([tag_score] * self.tag_size)) 102 | alpha_t = alpha_tm1 + transpose_transition_score + tag_score 103 | alpha_tm1 = log_sum_exp_dim_0(alpha_t) # (tag_size, batch_size) 104 | alphas.append(alpha_tm1) 105 | 106 | terminal_alpha = log_sum_exp_dim_0(alpha_tm1 + self.transition_matrix[self.end_id]) # (1, batch_size) 107 | return terminal_alpha,alphas 108 | 109 | def score_one_sequence(self, tag_scores, tags, batch_size): 110 | ''' tags: list of tag ids at each time step ''' 111 | # print tags, batch_size 112 | # print batch_size 113 | # print "scoring one sentence" 114 | tags = [[self.start_id] * batch_size] + tags # len(tag_scores) = len(tags) - 1 115 | score = dy.inputTensor(np.zeros(batch_size), batched=True) 116 | # tag_scores = dy.concatenate_cols(tag_scores) # tot_tags, sent_len, batch_size 117 | # print "tag dim: ", tag_scores.dim() 118 | for i in range(len(tags) - 1): 119 | score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, tags[i + 1]), tags[i]) \ 120 | + dy.pick_batch(tag_scores[i], tags[i + 1]) 121 | score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, [self.end_id]*batch_size), tags[-1]) 122 | return score 123 | 124 | def backward_one_sequence(self, tag_scores): 125 | ''' Backward DP for CRF. 
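        Mirror image of forward_alg: accumulates beta scores right-to-left and returns
        (terminal beta, betas), with betas re-ordered to run left-to-right.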
126 | tag_scores (list of batched dy.Tensor): (tag_size, batchsize) 127 | ''' 128 | # Be aware: if a is lookup_parameter with 2 dimension, then a[i] returns one row; 129 | # if b = dy.parameter(a), then b[i] returns one column; which means dy.parameter(a) already transpose a 130 | transpose_transition_score = dy.parameter(self.transition_matrix) 131 | # transpose_transition_score = dy.parameter(self.transition_matrix) 132 | 133 | # alpha(t', s) = the score of sequence from t=0 to t=t' in log space 134 | # np_init_alphas = -100.0 * np.ones((self.tag_size, batch_size)) 135 | # np_init_alphas[self.start_id, :] = 0.0 136 | # alpha_tm1 = dy.inputTensor(np_init_alphas, batched=True) 137 | betas = [] 138 | # beta_tp1 = self.transition_matrix[self.end_id] + tag_scores[-1] 139 | # beta_tp1 = dy.inputTensor(np.zeros(self.tag_size)) 140 | beta_tp1 = self.transition_matrix[self.end_id] 141 | betas.append(beta_tp1) 142 | # self.transition_matrix[i]: from i, column 143 | # transpose_score[i]: to i, row 144 | # transpose_score: to, from 145 | seq_len = len(tag_scores) 146 | tag_scores.reverse() 147 | for tag_score in tag_scores[0:seq_len - 1]: 148 | # extend for each transit 149 | beta_tp1 = dy.concatenate_cols([beta_tp1] * self.tag_size) # (to, from, batch_size) 150 | # each column i of tag_score will be the repeated emission score to tag i 151 | tag_score = dy.concatenate_cols([tag_score] * self.tag_size) # (to, from) 152 | beta_t = beta_tp1 + dy.transpose(transpose_transition_score) + tag_score # (to, from) 153 | beta_tp1 = log_sum_exp_dim_0(beta_t) # (tag_size, batch_size) 154 | betas.append(beta_tp1) 155 | 156 | # betas.append(beta_tp1 + transpose_transition_score[self.start_id] + tag_scores[-1]) 157 | terminal_beta = log_sum_exp_dim_0( 158 | beta_tp1 + transpose_transition_score[self.start_id] + tag_scores[-1]) # (1, batch_size) 159 | betas.reverse() 160 | return terminal_beta, betas 161 | 162 | def get_uncertain_subsequences(self, sents, tag_scores, alphas, betas, Z, gammas, 163 | best_path, B_tags, I_tags, O_tags): 164 | # predicted_path = deepcopy(best_path) 165 | # transition_B_O = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, O_tags), B_tags).value()) 166 | # transition_I_O = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, O_tags), I_tags).value()) 167 | # transition_B_I = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, I_tags), B_tags).value()) 168 | # transition_I_I = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, I_tags), I_tags).value()) 169 | 170 | 171 | first = True 172 | Z = Z.value() 173 | for i in range(len(sents)): 174 | # log_p_alpha = np.array(alphas[i].value())[B_tags] 175 | # transition = transition_B_O 176 | log_pin = logsumexp(np.array(gammas[i].value())[B_tags]) #Prob (y=start_entity|x)= log_sum{tags}# (e^log(P=B-tag|x)) 177 | 178 | for j in range(i + 1, len(sents)): 179 | if (j - i) > self.ngram: 180 | break 181 | 182 | log_p_out = np.array(gammas[j].value())[O_tags] 183 | log_p = log_pin + log_p_out 184 | p = np.exp(log_p) 185 | if p > 1.0: 186 | #print(p, log_p) 187 | H= 0.0 188 | else: 189 | H = -(p * log_p) - ((1-p)* np.log(1-p)) 190 | 191 | if H > self.entropy_threshold: 192 | # best_path = deepcopy(predicted_path) 193 | span = " ".join([str(x) for x in sents[i:j]]) 194 | sent = " ".join([str(x) for x in sents]) 195 | self.entropy_spans[span] += H 196 | self.entropy_spans_number[span] += 1 197 | 198 | if self.SPAN_wise: 199 | if span in self.most_uncertain_entropy_spans: 200 | (existing_entropy, _,_,_,_) = 
self.most_uncertain_entropy_spans[span] 201 | #if H > existing_entropy: 202 | # self.most_uncertain_entropy_spans[span] = (H, sent, i,j,best_path) 203 | self.most_uncertain_entropy_spans[span] = (self.entropy_spans[span], sent, i, j, best_path) 204 | else: 205 | self.most_uncertain_entropy_spans[span] = (H, sent, i,j,best_path) 206 | 207 | # for k in range(i,j+1): 208 | # best_path[k] = -10 209 | self.full_sentences[sent].append((i, j, best_path, self.entropy_spans[span])) 210 | self.avg_spans_in_sent_entropy[sent].append(span) 211 | # self.full_sentences[span] = (sents,best_path,predicted_path, self.entropy_spans[span]) 212 | 213 | log_pin += logsumexp(np.array(gammas[j].value())[I_tags]) 214 | if log_pin < np.log(1e-4): 215 | break 216 | 217 | 218 | def get_uncertain_subsequences_CFB(self, sents, tag_scores, alphas, betas, Z, gammas, 219 | best_path, tag_to_id,B_UNK, I_UNK): 220 | first = True 221 | Z = Z.value() 222 | entropy_spans_number = defaultdict(lambda :0) 223 | for i in range(len(sents)): 224 | known_tags =np.array([[0]] * len(sents)).reshape((len(sents),1,1)) 225 | known_tags[i][0][0] = 1 226 | tags = np.array([B_UNK] * len(sents)).reshape((len(sents),1)) 227 | tags[i][0] = best_path[i] 228 | for j in range(i + 1, len(sents)): 229 | if (j - i + 1) > self.ngram: 230 | break 231 | tags[j][0] = best_path[j] 232 | known_tags[j][0][0] = 1 233 | Z_span = self.score_one_sequence_partial(tag_scores, tags, 1, known_tags, tag_to_id, B_UNK, I_UNK) 234 | confidence = Z_span.value() - Z 235 | 236 | if confidence < self.entropy_threshold: 237 | # best_path = deepcopy(predicted_path) 238 | span = " ".join([str(x) for x in sents[i:j + 1]]) 239 | sent = " ".join([str(x) for x in sents]) 240 | 241 | if self.SPAN_wise: 242 | if span in self.most_uncertain_entropy_spans: 243 | (existing_threshold, _,_,_,_) = self.most_uncertain_entropy_spans[span] 244 | if confidence < existing_threshold: 245 | self.most_uncertain_entropy_spans[span] = (confidence, sent, i, j,best_path) 246 | else: 247 | self.most_uncertain_entropy_spans[span] = (confidence, sent, i, j,best_path) 248 | 249 | self.entropy_spans[span] += confidence 250 | self.entropy_spans_number[span] += 1 251 | #self.full_sentences[sent].append((i, j, best_path, self.entropy_spans[span])) 252 | self.full_sentences[sent].append((i, j, best_path, span)) 253 | #self.avg_spans_in_sent_entropy[sent].append(self.entropy_spans[span]) 254 | self.avg_spans_in_sent_entropy[sent].append(span) 255 | 256 | 257 | def decode_loss(self, src_encodings, tgt_tags, use_partial, known_tags, tag_to_id, B_UNK, I_UNK): 258 | # This is the batched version which requires bucketed batch input with the same length. 259 | ''' 260 | The length of src_encodings and tgt_tags are time_steps. 
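        When use_partial is set, positions whose tag is UNK are not forced to a single gold
        label; score_one_sequence_partial marginalises over all labels allowed by the mask,
        giving a partial-CRF training objective.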
261 | src_encodings: list of dynet.Tensor (src_output_dim, batch_size) 262 | tgt_tags: list of tag ids [(1, batch_size)] 263 | return: average of negative log likelihood 264 | ''' 265 | # TODO: transpose tgt tags first 266 | batch_size = len(tgt_tags) 267 | tgt_tags, tgt_mask = transpose_input(tgt_tags, 0) 268 | known_tags, _ = transpose_input(known_tags, 0) 269 | 270 | W_src2tag_readout = dy.parameter(self.W_src2tag_readout) 271 | b_src2tag_readout = dy.parameter(self.b_src2tag_readout) 272 | W_score_tag = dy.parameter(self.W_scores_readout2tag) 273 | b_score_tag = dy.parameter(self.b_scores_readout2tag) 274 | 275 | tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) for src_encoding 276 | in src_encodings] 277 | 278 | tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs] 279 | 280 | # scores over all paths, all scores are in log-space 281 | forward_scores,_ = self.forward_alg(tag_scores) 282 | 283 | if use_partial: 284 | gold_score = self.score_one_sequence_partial(tag_scores, tgt_tags, batch_size, known_tags, tag_to_id, B_UNK, 285 | I_UNK) 286 | else: 287 | gold_score = self.score_one_sequence(tag_scores, tgt_tags, batch_size) 288 | 289 | # negative log likelihood 290 | loss = dy.sum_batches(forward_scores - gold_score) / batch_size 291 | return loss #, dy.sum_batches(forward_scores)/batch_size, dy.sum_batches(gold_score) / batch_size 292 | 293 | def makeMask(self, batch_size, known_tags, tag_to_id, tags, index, B_UNK, I_UNK): 294 | mask_w_0 = np.array([[-1000] * self.tag_size]) 295 | mask_w_0 = np.transpose(mask_w_0) 296 | mask_w_0_all_s = np.reshape(np.array([mask_w_0] * batch_size), (self.tag_size, batch_size)) 297 | 298 | mask_idx = [] 299 | tag_vals = [] 300 | for idx, w0_si in enumerate(known_tags[index]): 301 | if w0_si[0] == 1: 302 | mask_idx.append(idx) 303 | tag_vals.append(tags[index][idx]) 304 | else: 305 | if tags[index][idx] == B_UNK: 306 | if self.args.misc: 307 | possible_labels = ["B-LOC", "B-PER", "B-ORG", "B-MISC", "O","I-LOC", "I-PER", "I-ORG", "I-MISC"] 308 | else: 309 | possible_labels = ["B-LOC", "B-PER", "B-ORG", "B-GPE", "O","I-LOC", "I-PER", "I-ORG", "I-GPE"] 310 | for pl in possible_labels: 311 | mask_idx.append(idx) 312 | tag_vals.append(tag_to_id[pl]) 313 | mask_w_0_all_s[tag_vals, mask_idx] = 0 314 | return mask_w_0_all_s 315 | 316 | def score_one_sequence_partial(self, tag_scores, tags, batch_size, known_tags, tag_to_id, B_UNK, I_UNK): 317 | transpose_transition_score = dy.parameter(self.transition_matrix) 318 | 319 | alpha_tm1 = transpose_transition_score[self.start_id] + tag_scores[0] 320 | 321 | mask_w_0_all_s = self.makeMask(batch_size, known_tags, tag_to_id, tags, 0, B_UNK, I_UNK) 322 | i = 1 323 | alpha_tm1 = alpha_tm1 + dy.inputTensor(mask_w_0_all_s, batched=True) 324 | for tag_score in tag_scores[1:]: 325 | alpha_tm1 = dy.concatenate_cols([alpha_tm1] * self.tag_size) # (from, to, batch_size) 326 | tag_score = dy.transpose(dy.concatenate_cols([tag_score] * self.tag_size)) 327 | alpha_t = alpha_tm1 + transpose_transition_score + tag_score 328 | alpha_tm1 = log_sum_exp_dim_0(alpha_t) # (tag_size, batch_size) 329 | mask_w_i_all_s = self.makeMask(batch_size, known_tags, tag_to_id, tags, i, B_UNK, I_UNK) 330 | alpha_tm1 = alpha_tm1 + dy.inputTensor(mask_w_i_all_s, batched=True) 331 | i = i + 1 332 | 333 | terminal_alpha = log_sum_exp_dim_0(alpha_tm1 + self.transition_matrix[self.end_id]) # (1, batch_size) 334 | return terminal_alpha 335 | 336 | 337 | def 
    def get_crf_scores(self, src_encodings):
        W_src2tag_readout = dy.parameter(self.W_src2tag_readout)
        b_src2tag_readout = dy.parameter(self.b_src2tag_readout)
        W_score_tag = dy.parameter(self.W_scores_readout2tag)
        b_score_tag = dy.parameter(self.b_scores_readout2tag)

        tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding]))
                    for src_encoding in src_encodings]
        tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs]

        transpose_transition_score = dy.parameter(self.transition_matrix)  # (from, to)

        return transpose_transition_score.npvalue(), [ts.npvalue() for ts in tag_scores]

    def decoding(self, src_encodings, OTag, addbias=False):
        ''' Viterbi decoding for a single sequence. '''
        W_src2tag_readout = dy.parameter(self.W_src2tag_readout)
        b_src2tag_readout = dy.parameter(self.b_src2tag_readout)
        W_score_tag = dy.parameter(self.W_scores_readout2tag)
        b_score_tag = dy.parameter(self.b_scores_readout2tag)

        tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding]))
                    for src_encoding in src_encodings]
        if addbias:
            # Replace the learned emission bias with a small constant bias towards the O tag.
            b_score_tag = np.zeros(self.tag_size)
            b_score_tag[OTag] = 0.5
            b_score_tag = dy.inputTensor(b_score_tag)

        tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs]

        back_trace_tags = []
        np_init_alpha = np.ones(self.tag_size) * -2000.0
        np_init_alpha[self.start_id] = 0.0
        max_tm1 = dy.inputTensor(np_init_alpha)
        transpose_transition_score = dy.parameter(self.transition_matrix)  # (from, to)

        for i, tag_score in enumerate(tag_scores):
            max_tm1 = dy.concatenate_cols([max_tm1] * self.tag_size)
            max_t = max_tm1 + transpose_transition_score
            if i != 0:
                eval_score = max_t.npvalue()[:-2, :]
            else:
                eval_score = max_t.npvalue()
            best_tag = np.argmax(eval_score, axis=0)
            back_trace_tags.append(best_tag)
            max_tm1 = dy.inputTensor(eval_score[best_tag, range(self.tag_size)]) + tag_score

        terminal_max_T = max_tm1 + self.transition_matrix[self.end_id]
        eval_terminal = terminal_max_T.npvalue()[:-2]
        best_tag = np.argmax(eval_terminal, axis=0)
        best_path_score = eval_terminal[best_tag]

        best_path = [best_tag]
        for btpoint in reversed(back_trace_tags):
            best_tag = btpoint[best_tag]
            best_path.append(best_tag)
        start = best_path.pop()
        assert start == self.start_id
        best_path.reverse()
        return best_path_score, best_path, tag_scores

    def cal_accuracy(self, pred_path, true_path):
        return np.sum(np.equal(pred_path, true_path).astype(np.float32)) / len(pred_path)

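# Usage sketch for the ensemble decoder defined below (hypothetical caller,
# not part of this file): each trained model contributes the numpy scores
# returned by get_crf_scores above, and `tag_size` is the number of real tags
# (start/end tags are appended inside ensemble_viterbi_decoding).
#
#     outputs = [decoder.get_crf_scores(src_encodings) for decoder in decoders]
#     l_transit_score = [transit for transit, _ in outputs]
#     l_tag_scores = [scores for _, scores in outputs]
#     best_score, best_path = ensemble_viterbi_decoding(l_tag_scores, l_transit_score, tag_size)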
def ensemble_viterbi_decoding(l_tag_scores, l_transit_score, tag_size):
    back_trace_tags = []
    tag_size = tag_size + 2
    start_id = tag_size - 2
    end_id = tag_size - 1
    max_tm1 = np.ones(tag_size) * -2000.0
    max_tm1[start_id] = 0.0

    # Average the emission scores and transition scores across ensemble members.
    tag_scores = []
    for i in range(len(l_tag_scores[0])):
        tag_scores.append(sum([ts[i] for ts in l_tag_scores]) / len(l_tag_scores))
    transpose_transition_score = sum(l_transit_score) / len(l_transit_score)  # (from, to)

    for i, tag_score in enumerate(tag_scores):
        max_tm1 = np.tile(np.expand_dims(max_tm1, axis=1), (1, tag_size))
        max_t = max_tm1 + transpose_transition_score
        if i != 0:
            eval_score = max_t[:-2, :]
        else:
            eval_score = max_t
        best_tag = np.argmax(eval_score, axis=0)
        back_trace_tags.append(best_tag)
        max_tm1 = eval_score[best_tag, range(tag_size)] + tag_score

    terminal_max_T = max_tm1 + transpose_transition_score[:, end_id]
    eval_terminal = terminal_max_T[:-2]
    best_tag = np.argmax(eval_terminal, axis=0)
    best_path_score = eval_terminal[best_tag]

    best_path = [best_tag]
    for btpoint in reversed(back_trace_tags):
        best_tag = btpoint[best_tag]
        best_path.append(best_tag)
    start = best_path.pop()
    assert start == start_id
    best_path.reverse()
    return best_path_score, best_path


# Simple per-token softmax decoder (no CRF transitions).
class classifier(Decoder):
    def __init__(self, model, input_dim, tag_size):
        self.W_softmax = model.add_parameters((tag_size, input_dim))
        self.b_softmax = model.add_parameters((tag_size,))

    def decode_loss(self, src_encoding, tgt_tags):
        batch_size = len(tgt_tags)
        tgt_tags, tgt_mask = transpose_input(tgt_tags, 0)

        assert len(src_encoding) == len(tgt_tags)

        W_softmax = dy.parameter(self.W_softmax)
        b_softmax = dy.parameter(self.b_softmax)

        predictions = [dy.affine_transform([b_softmax, W_softmax, src_emb]) for src_emb in src_encoding]

        losses = [dy.pickneglogsoftmax_batch(pred, tgt) for pred, tgt in zip(predictions, tgt_tags)]

        loss = dy.sum_batches(dy.esum(losses)) / (batch_size * len(src_encoding))

        return loss

    def decoding(self, src_encoding):
        W_softmax = dy.parameter(self.W_softmax)
        b_softmax = dy.parameter(self.b_softmax)
        predictions = [dy.affine_transform([b_softmax, W_softmax, src_emb]) for src_emb in src_encoding]

        predictions = [np.argmax(pred.npvalue()) for pred in predictions]

        return None, predictions

--------------------------------------------------------------------------------