├── models ├── __init__.py ├── model_builder.py ├── encoders.py └── decoders.py ├── dataloaders ├── __init__.py └── data_loader.py ├── helper_scripts ├── countLinesCONLL.py ├── CombineAnnotatedFiles.py ├── removeAnnotatedSents.py ├── pickKTokens.py ├── pickKTokensRev.py └── SimulateAnnotations.py ├── utils ├── features.py └── util.py ├── commands ├── SAL_CT.sh ├── ETAL_FULL_CRF_CT.sh ├── ETAL_PARTIAL_CRF_CT.sh └── CFEAL_PARTIAL_CRF_CT.sh ├── README.md ├── args.py ├── main.py └── eval └── conlleval.v2 /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataloaders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /helper_scripts/countLinesCONLL.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import argparse 3 | 4 | arg_parser = argparse.ArgumentParser() 5 | 6 | arg_parser.add_argument("--input", help="Folder with the raw text data", 7 | default=None, 8 | type=str) 9 | 10 | args = arg_parser.parse_args() 11 | print("Args used for this run:") 12 | print(args) 13 | 14 | with codecs.open(args.input,"r",encoding='utf-8') as fin: 15 | index = 0 16 | one_line = [] 17 | for line in fin: 18 | if line == "" or line == "\n": 19 | if len(one_line) > 0: 20 | index +=1 21 | one_line = [] 22 | else: 23 | line = line.strip() 24 | one_line.append(line) 25 | 26 | if len(one_line)>0: 27 | index = index + 1 28 | print index 29 | 30 | -------------------------------------------------------------------------------- /helper_scripts/CombineAnnotatedFiles.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import argparse 3 | 4 | 5 | arg_parser = argparse.ArgumentParser() 6 | 7 | arg_parser.add_argument("--files", help="File 1", 8 | default=None,nargs='+') 9 | 10 | #arg_parser.add_argument("--file2", help="File 2", 11 | # default=None, 12 | # type=str) 13 | 14 | arg_parser.add_argument("--output", help="Output File", 15 | default=None, 16 | type=str) 17 | 18 | args = arg_parser.parse_args() 19 | print("Args used for this run:") 20 | print(args) 21 | 22 | 23 | files = args.files 24 | fout = codecs.open(args.output, "w", encoding='utf-8') 25 | 26 | for i in files: 27 | with codecs.open(i,"r", encoding='utf-8') as fin: 28 | for line in fin: 29 | fout.write(line) 30 | print "Done reading file: " + str(i) 31 | fout.write("\n") 32 | -------------------------------------------------------------------------------- /utils/features.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import numpy as np 3 | import pdb 4 | 5 | 6 | def get_feature_sent(lang, sent, args, cap_ratio_dict, type=None): 7 | dsf = [] 8 | individual_feats = [] 9 | 10 | if args.cap and not args.use_discrete_features: 11 | cap_feat = [w[0].isupper() for w in sent] 12 | individual_feats.append(cap_feat) 13 | 14 | if args.cap_ratio_path is not None: 15 | cap_feats = [] 16 | for w in sent: 17 | # feat = np.zeros(4,) 18 | feat = [0, 0, 0, 0] 19 | if w in cap_ratio_dict: 20 | feat[cap_ratio_dict[w]] = 1 21 | cap_feats.append(feat) 22 | individual_feats.append(cap_feats) 23 | 24 | # individual_feats = zip(*individual_feats) # [(), ()] 25 | if len(dsf) > 0 and len(individual_feats) > 0: 26 | # individual_feats = [list(i) for i in 
individual_feats] 27 | dsf = [list(i) for i in dsf] 28 | # for i, d in zip(individual_feats, dsf): 29 | # print i, d 30 | # print len(i), len(d) 31 | new_feat = [list(tuple(i + d)) for i, d in zip(individual_feats[0], dsf)] 32 | # pdb.set_trace() 33 | return new_feat 34 | elif len(individual_feats) > 0: 35 | return individual_feats 36 | elif len(dsf) > 0: 37 | return dsf 38 | else: 39 | return [] 40 | 41 | 42 | def get_brown_cluster(path): 43 | bc_dict = dict() 44 | linear_map = dict() 45 | with codecs.open(path, "r", "utf-8") as fin: 46 | for line in fin: 47 | fields = line.strip().split('\t') 48 | if len(fields) == 3: 49 | word = fields[1] 50 | binary_string = fields[0] 51 | bid = int(binary_string, 2) 52 | if bid not in linear_map: 53 | linear_map[bid] = len(linear_map) 54 | bc_dict[word] = linear_map[bid] 55 | return bc_dict 56 | -------------------------------------------------------------------------------- /helper_scripts/removeAnnotatedSents.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import codecs 3 | 4 | def selectUnAnnotated(args): 5 | annotated_sents = set() 6 | with codecs.open(args.annotated, "r",encoding='utf-8') as fin: 7 | sent = [] 8 | count = 0 9 | for line in fin: 10 | line = line.strip() 11 | if line == "" or line == "\n": 12 | annotated_sents.add(" ".join(sent)) 13 | count +=1 14 | sent =[] 15 | else: 16 | tokens = line.split("\t") 17 | sent.append(tokens[0]) 18 | 19 | print(count, len(annotated_sents)) 20 | fout = codecs.open("./annotated_sents.txt","w", encoding='utf-8') 21 | for sent in annotated_sents: 22 | fout.write(sent + "\n") 23 | 24 | ffull = codecs.open("./orig_sents.txt","w", encoding='utf-8') 25 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout: 26 | sent = [] 27 | tokens = [] 28 | for line in fin: 29 | line = line.strip() 30 | if line == "" or line == "\n": 31 | sentence = " ".join(tokens) 32 | ffull.write(sentence + "\n") 33 | tokens = [] 34 | if sentence not in annotated_sents: 35 | #q print(sentence) 36 | for l in sent: 37 | fout.write(l + "\n") 38 | fout.write("\n") 39 | sent = [] 40 | else: 41 | sent.append(line) 42 | tokens.append(line.split("\t")[0]) 43 | 44 | 45 | if __name__ == "__main__": 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument("--input",type=str) 48 | parser.add_argument("--annotated", type=str) 49 | parser.add_argument("--output", type=str) 50 | args = parser.parse_args() 51 | print(args) 52 | selectUnAnnotated(args) 53 | -------------------------------------------------------------------------------- /helper_scripts/pickKTokens.py: -------------------------------------------------------------------------------- 1 | import codecs, argparse 2 | 3 | 4 | def pickKTokens(args): 5 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout: 6 | count = args.k 7 | one_sent = [] 8 | for line in fin: 9 | if line == "" or line == "\n": 10 | for s in one_sent: 11 | fout.write(s + "\n") 12 | fout.write('\n') 13 | one_sent = [] 14 | if count <=0: 15 | break 16 | 17 | else: 18 | tokens = line.strip().split("\t") 19 | tag = tokens[1] 20 | token = tokens[0] 21 | if "UNK" in tag: 22 | count -= 1 23 | 24 | one_sent.append(line.strip()) 25 | 26 | 27 | if len(one_sent) > 0: 28 | for s in one_sent: 29 | fout.write(s + "\n") 30 | fout.write('\n') 31 | 32 | def pickKTokensRev(args): 33 | with codecs.open(args.input, "r", encoding='utf-8') as fin, 
codecs.open(args.output, "w", encoding='utf-8') as fout: 34 | count = args.k 35 | one_sent = [] 36 | for line in fin: 37 | if line == "" or line == "\n": 38 | for s in one_sent: 39 | fout.write(s + "\n") 40 | fout.write('\n') 41 | one_sent = [] 42 | if count <=0: 43 | break 44 | 45 | else: 46 | tokens = line.strip().split("\t") 47 | tag = tokens[1] 48 | token = tokens[0] 49 | count -= 1 50 | 51 | one_sent.append(line.strip()) 52 | 53 | 54 | if len(one_sent) > 0: 55 | for s in one_sent: 56 | fout.write(s + "\n") 57 | fout.write('\n') 58 | 59 | 60 | 61 | if __name__ == "__main__": 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument("--input", type=str) 64 | parser.add_argument("--k", type=int) 65 | parser.add_argument("--output",type=str) 66 | args = parser.parse_args() 67 | 68 | pickKTokens(args) 69 | #pickKTokensRev(args) 70 | -------------------------------------------------------------------------------- /helper_scripts/pickKTokensRev.py: -------------------------------------------------------------------------------- 1 | import codecs, argparse 2 | 3 | 4 | def pickKTokens(args): 5 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout: 6 | count = args.k 7 | one_sent = [] 8 | for line in fin: 9 | if line == "" or line == "\n": 10 | for s in one_sent: 11 | fout.write(s + "\n") 12 | fout.write('\n') 13 | one_sent = [] 14 | if count <=0: 15 | break 16 | 17 | else: 18 | tokens = line.strip().split("\t") 19 | tag = tokens[1] 20 | token = tokens[0] 21 | if "UNK" in tag: 22 | count -= 1 23 | 24 | one_sent.append(line.strip()) 25 | 26 | 27 | if len(one_sent) > 0: 28 | for s in one_sent: 29 | fout.write(s + "\n") 30 | fout.write('\n') 31 | 32 | def pickKTokensRev(args): 33 | with codecs.open(args.input, "r", encoding='utf-8') as fin, codecs.open(args.output, "w", encoding='utf-8') as fout: 34 | count = args.k 35 | one_sent = [] 36 | for line in fin: 37 | if line == "" or line == "\n": 38 | for s in one_sent: 39 | fout.write(s + "\n") 40 | fout.write('\n') 41 | one_sent = [] 42 | if count <=0: 43 | break 44 | 45 | else: 46 | tokens = line.strip().split("\t") 47 | tag = tokens[1] 48 | token = tokens[0] 49 | count -= 1 50 | 51 | one_sent.append(line.strip()) 52 | 53 | 54 | if len(one_sent) > 0: 55 | for s in one_sent: 56 | fout.write(s + "\n") 57 | fout.write('\n') 58 | 59 | 60 | 61 | if __name__ == "__main__": 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument("--input", type=str) 64 | parser.add_argument("--k", type=int) 65 | parser.add_argument("--output",type=str) 66 | args = parser.parse_args() 67 | 68 | #pickKTokens(args) 69 | pickKTokensRev(args) 70 | -------------------------------------------------------------------------------- /commands/SAL_CT.sh: -------------------------------------------------------------------------------- 1 | DIR="../data/Spanish/SAL_CT" 2 | DATA="../data/Spanish" 3 | 4 | for i in {1..20} ; do 5 | python2 ../helper_scripts/pickKTokensRev.py --input $DIR/to_annotate_v${i}.1_LC.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll 6 | 7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll 8 | 9 | PREV=`expr $i - 1` 10 | 11 | python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR//unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll 12 | 13 | if [ "$i" -gt 1 ] 14 | then 15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll 
$DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll 16 | else 17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll 18 | fi 19 | 20 | #Train the NER Model Using FineTune 21 | MODEL_NAME="200_SAL_CT_spa_${i}.1_finetune" 22 | python -u ../main.py \ 23 | --dynet-seed 3278657 \ 24 | --word_emb_dim 100 \ 25 | --batch_size 10 \ 26 | --model_name ${MODEL_NAME} \ 27 | --lang es \ 28 | --fixedVocab \ 29 | --fineTune \ 30 | --test_conll \ 31 | --tot_epochs 1000 \ 32 | --aug_lang_train_path $DATA/vocab.conll \ 33 | --misc \ 34 | --init_lr 0.015 \ 35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \ 36 | --valid_freq 1300 \ 37 | --pretrain_emb_path $DATA/esp.vec \ 38 | --dev_path $DATA/esp.dev \ 39 | --test_path $DATA/esp.test \ 40 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 41 | 42 | 43 | #Run the Active Learning Session 44 | NEW=`expr $i + 1` 45 | #!/usr/bin/env bash 46 | MODEL_NAME="200_SAL_spa_${i}.1_finetune_activelearning" 47 | python -u ../main.py \ 48 | --dynet-seed 3278657 \ 49 | --mode test_1 \ 50 | --fixedVocab \ 51 | --aug_lang_train_path $DATA/vocab.conll \ 52 | --misc \ 53 | --word_emb_dim 100 \ 54 | --model_name ${MODEL_NAME} \ 55 | --lang es \ 56 | --load_from_path ../saved_models/200_SAL_CT_spa_${i}.1_finetune.model \ 57 | --pretrain_emb_path $DATA/esp.vec \ 58 | --dev_path $DATA/esp.dev \ 59 | --test_path $DIR/unlabel_v${i}.1.conll \ 60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \ 61 | --test_conll \ 62 | --k 200 \ 63 | --SPAN_wise \ 64 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 65 | 66 | done 67 | -------------------------------------------------------------------------------- /commands/ETAL_FULL_CRF_CT.sh: -------------------------------------------------------------------------------- 1 | DIR="../data/Spanish/ETAL_FULL_CRF_CT" 2 | DATA="../data/Spanish" 3 | 4 | for i in {1..20} ; do 5 | python2 ../helper_scripts/pickKTokens.py --input $DIR/to_annotate_v${i}.1.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll 6 | 7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll 8 | 9 | PREV=`expr $i - 1` 10 | 11 | python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR//unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll 12 | 13 | if [ "$i" -gt 1 ] 14 | then 15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll $DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll 16 | else 17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll 18 | fi 19 | 20 | #Train the NER Model Using FineTune 21 | MODEL_NAME="200_Entropy_Full_CT_spa_${i}.1_finetune" 22 | python -u ../main.py \ 23 | --dynet-seed 3278657 \ 24 | --word_emb_dim 100 \ 25 | --batch_size 10 \ 26 | --model_name ${MODEL_NAME} \ 27 | --lang es \ 28 | --fixedVocab \ 29 | --fineTune \ 30 | --test_conll \ 31 | --tot_epochs 1000 \ 32 | --misc \ 33 | --aug_lang_train_path $DATA/vocab.conll \ 34 | --init_lr 0.015 \ 35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \ 36 | --valid_freq 1300 \ 37 | --pretrain_emb_path $DATA/esp.vec \ 38 | --dev_path $DATA/esp.dev \ 39 | --test_path $DATA/esp.test \ 40 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 41 | 42 | 43 | #Run the Active Learning Session 44 | NEW=`expr $i + 1` 45 | #!/usr/bin/env bash 46 | MODEL_NAME="200_Entropy_Full_CT_spa_${i}.1_finetune_activelearning" 47 | python -u ../main.py \ 48 | --dynet-seed 3278657 \ 49 | 
--mode test_1 \ 50 | --fixedVocab \ 51 | --aug_lang_train_path $DATA/vocab.conll \ 52 | --word_emb_dim 100 \ 53 | --model_name ${MODEL_NAME} \ 54 | --lang es \ 55 | --misc \ 56 | --load_from_path ../saved_models/200_Entropy_Full_CT_spa_${i}.1_finetune.model \ 57 | --pretrain_emb_path $DATA/esp.vec \ 58 | --dev_path $DATA/esp.dev \ 59 | --test_path $DIR/unlabel_v${i}.1.conll \ 60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \ 61 | --ngram 5 \ 62 | --test_conll \ 63 | --entropy_threshold 1e-8 \ 64 | --k 200 \ 65 | --SPAN_wise \ 66 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 67 | 68 | done 69 | -------------------------------------------------------------------------------- /commands/ETAL_PARTIAL_CRF_CT.sh: -------------------------------------------------------------------------------- 1 | DIR="../data/Spanish/ETAL_PARTAL_CRF_CT" 2 | DATA="../data/Spanish" 3 | 4 | for i in {1..20} ; do 5 | python2 ../helper_scripts/pickKTokens.py --input $DIR/to_annotate_v${i}.1.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll 6 | 7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll --needUNK 8 | 9 | PREV=`expr $i - 1` 10 | 11 | python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR//unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll 12 | 13 | if [ "$i" -gt 1 ] 14 | then 15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll $DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll 16 | else 17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll 18 | fi 19 | 20 | #Train the NER Model Using FineTune 21 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune" 22 | python -u ../main.py \ 23 | --dynet-seed 3278657 \ 24 | --word_emb_dim 100 \ 25 | --batch_size 10 \ 26 | --model_name ${MODEL_NAME} \ 27 | --lang es \ 28 | --fixedVocab \ 29 | --fineTune \ 30 | --test_conll \ 31 | --misc \ 32 | --tot_epochs 1000 \ 33 | --aug_lang_train_path $DATA/vocab.conll \ 34 | --init_lr 0.015 \ 35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \ 36 | --valid_freq 1300 \ 37 | --pretrain_emb_path $DATA/esp.vec \ 38 | --use_partial \ 39 | --dev_path $DATA/esp.dev \ 40 | --test_path $DATA/esp.test \ 41 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 42 | 43 | 44 | #Run the Active Learning Session 45 | NEW=`expr $i + 1` 46 | #!/usr/bin/env bash 47 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune_activelearning" 48 | python -u ../main.py \ 49 | --dynet-seed 3278657 \ 50 | --mode test_1 \ 51 | --fixedVocab \ 52 | --aug_lang_train_path $DATA/vocab.conll \ 53 | --word_emb_dim 100 \ 54 | --model_name ${MODEL_NAME} \ 55 | --lang es \ 56 | --load_from_path ../saved_models/200_Entropy_Partial_CT_spa_${i}.1_finetune.model \ 57 | --pretrain_emb_path $DATA/esp.vec \ 58 | --dev_path $DATA/esp.dev \ 59 | --test_path $DIR/unlabel_v${i}.1.conll \ 60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \ 61 | --ngram 5 \ 62 | --misc \ 63 | --test_conll \ 64 | --entropy_threshold 1e-8 \ 65 | --use_partial \ 66 | --k 200 \ 67 | --SPAN_wise \ 68 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 69 | 70 | done 71 | -------------------------------------------------------------------------------- /commands/CFEAL_PARTIAL_CRF_CT.sh: -------------------------------------------------------------------------------- 1 | DIR="../data/Spanish/CFEAL_PARTAL_CRF_CT" 2 | DATA="../data/Spanish" 3 | 4 | for i 
in {1..20} ; do 5 | python2 ../helper_scripts/pickKTokens.py --input $DIR/to_annotate_v${i}.1.conll --k 200 --output $DIR/to_annotate_v${i}.1_200.conll 6 | 7 | python2 ../helper_scripts/SimulateAnnotations.py --input $DIR/to_annotate_v${i}.1_200.conll --output $DIR/v${i}.1.conll --needUNK 8 | 9 | PREV=`expr $i - 1` 10 | 11 | python2 ../helper_scripts/removeAnnotatedSents.py --input $DIR//unlabel_v${PREV}.1.conll --annotated $DIR/v${i}.1.conll --output $DIR/unlabel_v${i}.1.conll 12 | 13 | if [ "$i" -gt 1 ] 14 | then 15 | python2 ../helper_scripts/CombineAnnotatedFiles.py --files $DIR/Entropy_v${PREV}.1.conll $DIR/v${i}.1.conll --output $DIR/Entropy_v${i}.1.conll 16 | else 17 | cp $DIR/v1.1.conll $DIR/Entropy_v1.1.conll 18 | fi 19 | 20 | #Train the NER Model Using FineTune 21 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune" 22 | python -u ../main.py \ 23 | --dynet-seed 3278657 \ 24 | --word_emb_dim 100 \ 25 | --batch_size 10 \ 26 | --model_name ${MODEL_NAME} \ 27 | --lang es \ 28 | --fixedVocab \ 29 | --fineTune \ 30 | --test_conll \ 31 | --tot_epochs 1000 \ 32 | --aug_lang_train_path $DATA/vocab.conll \ 33 | --misc \ 34 | --init_lr 0.015 \ 35 | --load_from_path ../saved_models/spanish_full_transfer_baseline.model \ 36 | --valid_freq 1300 \ 37 | --pretrain_emb_path $DATA/esp.vec \ 38 | --use_partial \ 39 | --dev_path $DATA/esp.dev \ 40 | --test_path $DATA/esp.test \ 41 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 42 | 43 | 44 | #Run the Active Learning Session 45 | NEW=`expr $i + 1` 46 | #!/usr/bin/env bash 47 | MODEL_NAME="200_Entropy_Partial_CT_spa_${i}.1_finetune_activelearning" 48 | python -u ../main.py \ 49 | --dynet-seed 3278657 \ 50 | --mode test_1 \ 51 | --fixedVocab \ 52 | --aug_lang_train_path $DATA/vocab.conll \ 53 | --word_emb_dim 100 \ 54 | --model_name ${MODEL_NAME} \ 55 | --lang es \ 56 | --load_from_path ../saved_models/200_Entropy_Partial_CT_spa_${i}.1_finetune.model \ 57 | --pretrain_emb_path $DATA/esp.vec \ 58 | --dev_path $DATA/esp.dev \ 59 | --test_path $DIR/unlabel_v${i}.1.conll \ 60 | --to_annotate $DIR/to_annotate_v${NEW}.1.conll \ 61 | --misc \ 62 | --ngram 5 \ 63 | --test_conll \ 64 | --entropy_threshold 0 \ 65 | --use_partial \ 66 | --k 200 \ 67 | --use_CFB \ 68 | --SPAN_wise \ 69 | --train_path $DIR/Entropy_v${i}.1.conll 2>&1 | tee ${MODEL_NAME}.log 70 | 71 | done 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Active Learning for Entity Recognition 2 | 3 | ### Requirements 4 | python 2.7
5 | DyNet version: commit 284838815ece9297a7100cc43035e1ea1b133a5 6 | 7 | 8 | ### Data 9 | In ```data/```, create one directory per language, as shown for ```data/Spanish```. Download the CoNLL train/dev/test NER datasets for that language here. To acquire the LDC datasets, please obtain the required access. 10 | 11 | 12 | For storing the trained models, create a ```saved_models``` directory in the parent folder. 13 | ### Embeddings 14 | Combine monolingual data acquired from Wikipedia with the plain text extracted from the labeled data, and train 100-dimensional [GloVe](https://nlp.stanford.edu/projects/glove/) embeddings on the combined text. 15 | 16 | ### Active Learning Simulation 17 | The best NER performance was obtained with the fine-tuning training scheme. The scripts below run simulated active learning with the different selection strategies (a sketch of the entropy-based span selection is given after the list): ``` cd commands```
19 | * ETAL + Partial-CRF + CT (Proposed recipe)
``` ./ETAL_PARTIAL_CRF_CT.sh ```
20 | * ETAL + Full-CRF + CT
``` ./ETAL_FULL_CRF_CT.sh ```
21 | * CFEAL + Partial-CRF + CT
``` ./CFEAL_PARTIAL_CRF_CT.sh ```
22 | * SAL + CT
23 | ``` ./SAL_CT.sh ```
29 | 30 | ```El O```
31 | ```grupo O```
32 | 33 | The LDC NER label set differ from the CoNLL label set by one tag. Therefore, add ``` --misc ``` to the argument set when running any experiments on CoNLL data. The label set has been hard-coded in the ```data_loaders/data_loader.py``` file. 34 | 35 | ### Cross-Lingual Transferred Data 36 | We used the model proposed by (Xie et al. 2018) to get the cross-lingually transferred data from English. 37 | Please refer to their code [here](https://github.com/thespectrewithin/cross-lingual_NER). 38 | 39 | For the Fine-Tune training scheme, train a base NER model on the transferred model as follows: 40 | 41 | MODEL_NAME="spanish_full_transfer_baseline" 42 | python -u ../main.py \ 43 | --dynet-seed 3278657 \ 44 | --word_emb_dim 100 \ 45 | --batch_size 10 \ 46 | --model_name ${MODEL_NAME} \ 47 | --lang es \ 48 | --fixedVocab \ 49 | --test_conll \ 50 | --tot_epochs 1000 \ 51 | --aug_lang_train_path $DATA/vocab.conll \ 52 | --init_lr 0.015 \ 53 | --valid_freq 1300 \ 54 | --misc \ 55 | --pretrain_emb_path $DATA/esp.vec \ 56 | --dev_path $DATA/esp.dev \ 57 | --test_path $DATA/esp.test \ 58 | --train_path $DIR/transferred_data.conll 2>&1 | tee ${MODEL_NAME}.log 59 | 60 | ### References 61 | If you make use of this software for research purposes, we will appreciate citing the following: 62 | ``` 63 | @inproceedings{chaudhary19emnlp, 64 | title = {A Little Annotation does a Lot of Good: A Study in Bootstrapping Low-resource Named Entity Recognizers}, 65 | author = {Aditi Chaudhary and Jiateng Xie and Zaid Sheikh and Graham Neubig and Jaime Carbonell}, 66 | booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 67 | address = {Hong Kong}, 68 | month = {November}, 69 | url = {http://arxiv.org/abs/1908.08983}, 70 | year = {2019} 71 | } 72 | ``` 73 | 74 | ### Contact 75 | For any issues, please feel free to reach out to `aschaudh@andrew.cmu.edu`. 
76 | -------------------------------------------------------------------------------- /helper_scripts/SimulateAnnotations.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import argparse 3 | from copy import deepcopy 4 | 5 | 6 | 7 | 8 | 9 | 10 | def annotate(input, output): 11 | gold_lines = [] 12 | 13 | with codecs.open(input, "r", encoding='utf-8') as fin,codecs.open(output, "w", encoding='utf-8') as fout: 14 | actual_line = [] 15 | actual_one_line = [] 16 | 17 | crf_line= [] 18 | crf_one_line = [] 19 | 20 | gold_one_line = [] 21 | prev = "" 22 | for line in fin: 23 | if line == "" or line == "\n": 24 | #fout.write("\n") 25 | actual_line.append(actual_one_line) 26 | actual_one_line = [] 27 | 28 | crf_line.append(crf_one_line) 29 | crf_one_line = [] 30 | 31 | gold_lines.append(gold_one_line) 32 | gold_one_line = [] 33 | 34 | prev = "" 35 | else: 36 | tokens = line.strip().split() 37 | gold_one_line.append(tokens[-1]) 38 | 39 | if "UNK" in tokens[1]: #Find the true start of the entity 40 | #fout.write(tokens[0] + "\t" + tokens[-1] + '\n') 41 | actual_one_line.append(tokens[0] + "\t" + tokens[-1]) 42 | prev = tokens[-1] 43 | 44 | 45 | else: 46 | #fout.write(tokens[0] + "\t" + tokens[1] + '\n') 47 | # actual_one_line.append(tokens[0] + "\t" + tokens[1]) 48 | if prev != "" and tokens[-1].startswith("I-"): 49 | BIO_tag = tokens[-1] 50 | prev =tokens[-1] 51 | else: 52 | if args.needUNK: 53 | BIO_tag = "B-UNK" 54 | else: 55 | #BIO_tag = "O" 56 | BIO_tag = tokens[1] 57 | prev = "" 58 | actual_one_line.append(tokens[0] + "\t" + BIO_tag) 59 | 60 | 61 | index = 0 62 | lines = [] 63 | one_line = [] 64 | for line in actual_line: 65 | prev = "" 66 | for token_tag in line: 67 | current_tag = token_tag.split("\t")[-1] 68 | token = token_tag.split("\t")[0] 69 | 70 | 71 | if prev != "": 72 | if prev == "O" and "I-" in current_tag: 73 | #print("Check index :{0} for inconsistency {1}".format(index, token)) 74 | token_tag = token + "\t" + "B-" + current_tag.split("-")[-1] 75 | 76 | if (prev == "B-PER" or prev == "I-PER") and current_tag in ['I-LOC, I-ORG, I-GPE']: 77 | #print("Check index :{0} for inconsistency {1}".format(index, token)) 78 | token_tag = token + "\t" + "I-PER" 79 | 80 | if (prev == "B-GPE" or prev == "I-GPE") and current_tag in ['I-LOC, I-ORG, I-PER']: 81 | #print("Check index :{0} for inconsistency".format(index,token)) 82 | token_tag = token + "\t" + "I-GPE" 83 | 84 | if (prev == "B-LOC" or prev == "I-LOC") and current_tag in ['I-PER, I-ORG, I-GPE']: 85 | #print("Check index :{0} for inconsistency {1}".format(index,token)) 86 | token_tag = token + "\t" + "I-LOC" 87 | 88 | if (prev == "B-ORG" or prev == "I-ORG") and current_tag in ['I-LOC, I-PER, I-GPE']: 89 | #print("Check index :{0} for inconsistency {1}".format(index,token)) 90 | token_tag = token + "\t" + "I-ORG" 91 | 92 | 93 | 94 | prev = current_tag 95 | 96 | index +=1 97 | one_line.append(token_tag) 98 | #fout.write(token_tag + "\n") 99 | index += 1 100 | lines.append(one_line) 101 | one_line =[] 102 | #fout.write("\n") 103 | print(len(lines)) 104 | for line_num, line in enumerate(lines): 105 | prev = "" 106 | for token_num, token_tag in enumerate(line): 107 | token = token_tag.split("\t")[0] 108 | tag = token_tag.split("\t")[-1] 109 | if prev != "": 110 | if prev in ["B-UNK","O"] and tag in ["I-LOC", "I-GPE", "I-LOC", "I-MISC","I-PER","I-ORG"]: 111 | gold_one_line = gold_lines[line_num] 112 | gold_cur_tag = gold_one_line[token_num] 113 | temp_num = deepcopy(token_num) 114 | 
while not gold_cur_tag.startswith("B-"): 115 | temp_num -=1 116 | gold_cur_tag = gold_one_line[temp_num] 117 | line[temp_num] = line[temp_num].split("\t")[0] + "\t" + gold_cur_tag 118 | prev = tag 119 | 120 | for token_tag in line: 121 | fout.write(token_tag + '\n') 122 | fout.write("\n") 123 | 124 | 125 | if __name__ == "__main__": 126 | parser = argparse.ArgumentParser() 127 | parser.add_argument("--input", type=str, default=None, help="Active learning output") 128 | parser.add_argument("--output", type=str, default=None, help ="Simulated NI with gold annotations in place of UNK") 129 | parser.add_argument("--needUNK", default=False, action="store_true", help="Simulated NI with gold annotations in place of UNK") 130 | args = parser.parse_args() 131 | 132 | annotate(args.input, args.output) 133 | -------------------------------------------------------------------------------- /args.py: -------------------------------------------------------------------------------- 1 | def init_config(): 2 | import argparse 3 | parser = argparse.ArgumentParser() 4 | parser.add_argument("--dynet-mem", default=1000, type=int) 5 | parser.add_argument("--dynet-seed", default=5783287, type=int) 6 | parser.add_argument("--dynet-gpu") 7 | 8 | parser.add_argument("--model_name", type=str, default=None) 9 | parser.add_argument("--eval_folder", type=str, default="../eval") 10 | parser.add_argument("--lang", default="english", help="the target language") 11 | parser.add_argument("--train_ensemble", default=False, action="store_true") 12 | parser.add_argument("--full_data_path", type=str, default=None, help="when train_ensemble is true, this one is the full data path from which to load vocabulary.") 13 | parser.add_argument("--train_path", default="../datasets/english/eng.train.bio.conll", type=str) 14 | # parser.add_argument("--train_path", default="../datasets/english/debug_train.bio", type=str) 15 | parser.add_argument("--monolingual_data_path", default=None, type=str) 16 | parser.add_argument("--dev_path", default="../datasets/english/eng.dev.bio.conll", type=str) 17 | parser.add_argument("--test_path", default="../datasets/english/eng.test.bio.conll", type=str) 18 | parser.add_argument("--new_test_path", default="../datasets/english/eng.test.bio.conll", type=str) 19 | parser.add_argument("--new_test_conll", default="../datasets/english/eng.test.bio.conll", type=str) 20 | parser.add_argument("--save_to_path", default="../saved_models/") 21 | parser.add_argument("--load_from_path", default=None) 22 | parser.add_argument("--train_filename_path", default=None, type=str) 23 | parser.add_argument("--dev_filename_path", default=None, type=str) 24 | parser.add_argument("--test_filename_path", default=None, type=str) 25 | 26 | 27 | parser.add_argument("--model_arc", default="char_cnn", choices=["char_cnn", "char_birnn", "char_birnn_cnn", "sep", "sep_cnn_only"], type=str) 28 | parser.add_argument("--tag_emb_dim", default=50, type=int) 29 | parser.add_argument("--pos_emb_dim", default=50, type=int) 30 | parser.add_argument("--char_emb_dim", default=30, type=int) 31 | parser.add_argument("--word_emb_dim", default=100, type=int) 32 | parser.add_argument("--cnn_filter_size", default=30, type=int) 33 | parser.add_argument("--cnn_win_size", default=3, type=int) 34 | parser.add_argument("--rnn_type", default="lstm", choices=['lstm', 'gru'], type=str) 35 | parser.add_argument("--hidden_dim", default=200, type=int, help="token level rnn hidden dim") 36 | parser.add_argument("--char_hidden_dim", default=25, type=int, help="char 
level rnn hidden dim") 37 | parser.add_argument("--layer", default=1, type=int) 38 | 39 | parser.add_argument("--replace_unk_rate", default=0.0, type=float, help="uses when not all words in the test data is covered by the pretrained embedding") 40 | parser.add_argument("--remove_singleton", default=False, action="store_true") 41 | parser.add_argument("--map_pretrain", default=False, action="store_true") 42 | parser.add_argument("--map_dim", default=100, type=int) 43 | parser.add_argument("--pretrain_fix", default=False, action="store_true") 44 | 45 | parser.add_argument("--output_dropout_rate", default=0.5, type=float, help="dropout applied to the output of birnn before crf") 46 | parser.add_argument("--emb_dropout_rate", default=0.3, type=float, help="dropout applied to the input of token-level birnn") 47 | parser.add_argument("--valid_freq", default=500, type=int) 48 | parser.add_argument("--tot_epochs", default=100, type=int) 49 | parser.add_argument("--batch_size", default=10, type=int) 50 | parser.add_argument("--init_lr", default=0.015, type=float) 51 | parser.add_argument("--lr_decay", default=False, action="store_true") 52 | parser.add_argument("--decay_rate", default=0.05, action="store", type=float) 53 | parser.add_argument("--patience", default=3, type=int) 54 | 55 | parser.add_argument("--tagging_scheme", default="bio", choices=["bio", "bioes"], type=str) 56 | 57 | parser.add_argument("--data_aug", default=False, action="store_true", help="If use data_aug, the train_path should be the combined training file") 58 | parser.add_argument("--aug_lang", default="english", help="the language to augment the dataset") 59 | parser.add_argument("--aug_lang_train_path", default=None, type=str) 60 | parser.add_argument("--tgt_lang_train_path", default="../datasets/english/eng.train.bio.conll", type=str) 61 | 62 | parser.add_argument("--pretrain_emb_path", type=str, default=None) 63 | parser.add_argument("--res_discrete_feature", default=False, action="store_true", help="residual use of discrete features") 64 | 65 | parser.add_argument("--feature_birnn_hidden_dim", default=50, type=int, action="store") 66 | 67 | parser.add_argument("--use_discrete_features", default=False, action="store_true", help="David's indicator features") 68 | parser.add_argument("--split_hashtag", default=False, action="store_true", help="indicator of preceding hashtags") 69 | parser.add_argument("--cap", default=False, action="store_true", help="capitalization feature") 70 | parser.add_argument("--feature_dim", type=int, default=10, help="dimension of discrete features") 71 | 72 | parser.add_argument("--use_brown_cluster", default=False, action="store_true") 73 | parser.add_argument("--brown_cluster_path", action="store", type=str, help="path to the brown cluster features") 74 | parser.add_argument("--brown_cluster_num", default=0, type=int, action="store") 75 | parser.add_argument("--brown_cluster_dim", default=30, type=int, action="store") 76 | 77 | # Use trained model to test 78 | parser.add_argument("--mode", default="train", type=str, choices=["train", "test_1"], 79 | help="test_1: use one model") 80 | 81 | # Partial CRF 82 | parser.add_argument("--use_partial", default=False, action="store_true") 83 | 84 | # Active Learning 85 | parser.add_argument("--ngram", default=2, type=int) 86 | parser.add_argument("--to_annotate", type=str,default="./annotate.txt") 87 | parser.add_argument("--entropy_threshold", type=float, default=None) 88 | parser.add_argument("--use_CFB", default=False, action="store_true") 89 | 
parser.add_argument("--SPAN_wise", default=False, action="store_true", help="get span wise scores, even if there are duplicates.") 90 | parser.add_argument("--k", default=200, type=int, help="fixed number of spans to annotate") 91 | parser.add_argument("--debug", type=str) 92 | # Format of test output 93 | parser.add_argument("--test_conll", default=False, action="store_true") 94 | parser.add_argument("--fixedVocab", default=False, action="store_true", help="for loading pre-trained model") 95 | parser.add_argument("--fineTune", default=False, action="store_true", help="for loading pre-trained model") 96 | parser.add_argument("--run",default=0, type=int) 97 | parser.add_argument("--misc",default=False, action="store_true") 98 | parser.add_argument("--addbias", default=False, action="store_true") 99 | args = parser.parse_args() 100 | 101 | 102 | if args.train_ensemble: 103 | # model_name = ens_1_ + original 104 | # set dynet seed manually 105 | ens_no = int(args.model_name.split("_")[1]) 106 | # dyparams = dy.DynetParams() 107 | # dyparams.set_random_seed(ens_no + 5783287) 108 | # dyparams.init() 109 | 110 | import dynet_config 111 | dynet_config.set(random_seed=ens_no + 5783290) 112 | # if args.cuda: 113 | # dynet_config.set_gpu() 114 | 115 | # args.train_path = args.train_path.split(".")[0] + "_" + str(ens_no) + ".conll" 116 | 117 | if args.full_data_path is None: 118 | args.full_data_path = args.train_path 119 | args.save_to_path = args.save_to_path + args.model_name + ".model" 120 | print(args) 121 | return args 122 | -------------------------------------------------------------------------------- /models/model_builder.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou and aditichaudhary' 2 | from encoders import * 3 | from decoders import * 4 | from collections import defaultdict 5 | from copy import deepcopy 6 | 7 | #np.set_printoptions(threshold='nan') 8 | 9 | 10 | class CRF_Model(object): 11 | def __init__(self, args, data_loader, lm_data_loader=None): 12 | self.save_to = args.save_to_path 13 | self.load_from = args.load_from_path 14 | tag_to_id = data_loader.tag_to_id 15 | self.constraints = None 16 | # print self.constraints 17 | 18 | #partial CRF 19 | self.use_partial = args.use_partial 20 | self.tag_to_id = tag_to_id 21 | self.B_UNK = data_loader.B_UNK 22 | self.I_UNK = data_loader.I_UNK 23 | 24 | #active learning for partial annotations 25 | self.entropy_spans = defaultdict(lambda: 0) 26 | self.full_sentences = {} 27 | self.use_CFB = args.use_CFB 28 | self.addbias = args.addbias 29 | self.B_tags = [] 30 | self.I_tags = [] 31 | self.O_tags = [] 32 | B_tags = [] 33 | I_tags = [] 34 | for tag in tag_to_id: 35 | if "B-" in tag: 36 | B_tags.append(tag) 37 | elif "I-" in tag: 38 | I_tags.append(tag) 39 | elif tag == "O": 40 | self.O_tags.append(tag_to_id[tag]) 41 | B_tags = sorted(B_tags) 42 | I_tags = sorted(I_tags) 43 | self.B_tags = [tag_to_id[tag] for tag in B_tags] 44 | self.I_tags = [tag_to_id[tag] for tag in I_tags] 45 | 46 | def forward(self, sents, char_sents, feats, bc_feats, training=True): 47 | raise NotImplementedError 48 | 49 | def save(self): 50 | if self.save_to is not None: 51 | self.model.save(self.save_to) 52 | else: 53 | print('Save to path not provided!') 54 | 55 | def load(self, path=None): 56 | if path is None: 57 | path = self.load_from 58 | if self.load_from is not None or path is not None: 59 | print('Load model parameters from %s!' 
% path) 60 | self.model.populate(path) 61 | else: 62 | print('Load from path not provided!') 63 | 64 | def cal_loss(self, sents, char_sents, ner_tags, feats, bc_feats, known_tags, lm_batch=None, training=True): 65 | birnn_outputs = self.forward(sents, char_sents, feats, bc_feats, training=training) 66 | crf_loss = self.crf_decoder.decode_loss(birnn_outputs, ner_tags,self.use_partial, known_tags, self.tag_to_id, self.B_UNK, self.I_UNK) 67 | return crf_loss#, sum_s, sent_s 68 | 69 | def eval(self, sents, char_sents, feats, bc_feats, training=False,type="eval"): 70 | birnn_outputs = self.forward(sents, char_sents, feats, bc_feats, training=training) 71 | best_score, best_path, tag_scores = self.crf_decoder.decoding(birnn_outputs, self.O_tags, addbias=self.addbias) 72 | best_path_copy = deepcopy(best_path) 73 | if type == "test": 74 | alpha_value, alphas = self.crf_decoder.forward_alg(tag_scores) 75 | beta_value, betas = self.crf_decoder.backward_one_sequence(tag_scores) 76 | # print("Alpha:{0} Beta:{1}".format(alpha_value.value(), beta_value.value())) 77 | sent = sents[0] 78 | gammas = [] 79 | sum = [] 80 | for i in range(len(sent)): 81 | gammas.append(alphas[i] + betas[i] - alpha_value) 82 | 83 | if self.use_CFB: 84 | self.crf_decoder.get_uncertain_subsequences_CFB(sent, tag_scores, alphas, betas, alpha_value, gammas, 85 | best_path_copy, self.tag_to_id 86 | , self.B_UNK, self.I_UNK) 87 | 88 | else: 89 | self.crf_decoder.get_uncertain_subsequences(sent, tag_scores, alphas, betas, alpha_value, gammas, 90 | best_path_copy 91 | , self.B_tags, self.I_tags, self.O_tags) 92 | 93 | 94 | return best_score - alpha_value, best_path 95 | else: 96 | return best_score, best_path 97 | 98 | def eval_scores(self, sents, char_sents, feats, bc_feats, training=False): 99 | birnn_outputs = self.forward(sents, char_sents, feats, bc_feats, training=training) 100 | tag_scores, transit_score = self.crf_decoder.get_crf_scores(birnn_outputs) 101 | return tag_scores, transit_score 102 | 103 | 104 | class vanilla_NER_CRF_model(CRF_Model): 105 | ''' Implement End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF. 
''' 106 | def __init__(self, args, data_loader, lm_data_loader=None): 107 | # super(vanilla_NER_CRF_model, self).__init__(args, data_loader) 108 | self.model = dy.Model() 109 | self.args = args 110 | super(vanilla_NER_CRF_model, self).__init__(args, data_loader) 111 | 112 | self.res_discrete = args.res_discrete_feature 113 | 114 | ner_tag_size = data_loader.ner_vocab_size 115 | char_vocab_size = data_loader.char_vocab_size 116 | word_vocab_size = data_loader.word_vocab_size 117 | word_padding_token = data_loader.word_padding_token 118 | 119 | char_emb_dim = args.char_emb_dim 120 | word_emb_dim = args.word_emb_dim 121 | tag_emb_dim = args.tag_emb_dim 122 | birnn_input_dim = args.cnn_filter_size + args.word_emb_dim 123 | hidden_dim = args.hidden_dim 124 | src_ctx_dim = args.hidden_dim * 2 125 | 126 | cnn_filter_size = args.cnn_filter_size 127 | cnn_win_size = args.cnn_win_size 128 | output_dropout_rate = args.output_dropout_rate 129 | emb_dropout_rate = args.emb_dropout_rate 130 | 131 | if args.use_discrete_features: 132 | self.num_feats = data_loader.num_feats 133 | self.feature_encoder = Discrete_Feature_Encoder(self.model, self.num_feats, args.feature_dim) 134 | if self.res_discrete: 135 | src_ctx_dim += args.feature_dim * self.num_feats 136 | else: 137 | birnn_input_dim += args.feature_dim * self.num_feats 138 | 139 | if args.use_brown_cluster: 140 | bc_num = args.brown_cluster_num 141 | bc_dim = args.brown_cluster_dim 142 | # for each batch, the length of input seqs are the same, so we don't have bother with padding 143 | self.bc_encoder = Lookup_Encoder(self.model, args, bc_num, bc_dim, word_padding_token, isFeatureEmb=True) 144 | 145 | if self.res_discrete: 146 | src_ctx_dim += bc_dim 147 | else: 148 | birnn_input_dim += bc_dim 149 | 150 | self.char_cnn_encoder = CNN_Encoder(self.model, char_emb_dim, cnn_win_size, cnn_filter_size, 151 | 0.0, char_vocab_size, data_loader.char_padding_token) 152 | if args.pretrain_emb_path is None: 153 | self.word_lookup = Lookup_Encoder(self.model, args, word_vocab_size, word_emb_dim, word_padding_token) 154 | else: 155 | print("In NER CRF: Using pretrained word embedding!") 156 | self.word_lookup = Lookup_Encoder(self.model, args, word_vocab_size, word_emb_dim, word_padding_token, data_loader.pretrain_word_emb) 157 | self.birnn_encoder = BiRNN_Encoder(self.model, birnn_input_dim, hidden_dim, emb_dropout_rate=emb_dropout_rate, 158 | output_dropout_rate=output_dropout_rate) 159 | 160 | self.crf_decoder = chain_CRF_decoder(args, self.model, src_ctx_dim, tag_emb_dim, ner_tag_size, constraints=self.constraints) 161 | 162 | def forward(self, sents, char_sents, feats, bc_feats, training=True): 163 | char_embs = self.char_cnn_encoder.encode(char_sents, training=training) 164 | word_embs = self.word_lookup.encode(sents) 165 | 166 | if self.args.use_discrete_features: 167 | feat_embs = self.feature_encoder.encode(feats) 168 | 169 | if self.args.use_brown_cluster: 170 | bc_feat_embs = self.bc_encoder.encode(bc_feats) 171 | 172 | if self.args.use_discrete_features and self.args.use_brown_cluster: 173 | concat_inputs = [dy.concatenate([c, w, f, b]) for c, w, f, b in 174 | zip(char_embs, word_embs, feat_embs, bc_feat_embs)] 175 | elif self.args.use_brown_cluster and not self.args.use_discrete_features: 176 | concat_inputs = [dy.concatenate([c, w, f]) for c, w, f in 177 | zip(char_embs, word_embs, bc_feat_embs)] 178 | elif self.args.use_discrete_features and not self.args.use_brown_cluster: 179 | concat_inputs = [dy.concatenate([c, w, f]) for c, w, f in 180 | 
zip(char_embs, word_embs, feat_embs)] 181 | else: 182 | concat_inputs = [dy.concatenate([c, w]) for c, w in zip(char_embs, word_embs)] 183 | 184 | birnn_outputs = self.birnn_encoder.encode(concat_inputs, training=training) 185 | return birnn_outputs 186 | -------------------------------------------------------------------------------- /dataloaders/data_loader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou and aditichaudhary' 2 | import os 3 | from utils.util import * 4 | from utils.features import * 5 | #from utils.segnerfts import orm_morph as ormnorm 6 | 7 | #tagset = ['B-LOC','B-PER','B-MISC', 'B-ORG','I-LOC','I-PER','I-MISC', 'I-ORG','O'] 8 | tagset = ['B-LOC','B-PER','B-GPE', 'B-ORG','I-LOC','I-PER','I-GPE', 'I-ORG','O'] 9 | 10 | class NER_DataLoader(): 11 | def __init__(self, args, special_normal=False): 12 | # This is data loader as well as feature extractor!! 13 | 14 | self.args = args 15 | if args.train_ensemble: 16 | self.train_path = args.full_data_path 17 | else: 18 | self.train_path = args.train_path 19 | self.test_path = args.test_path 20 | self.dev_path = args.dev_path 21 | self.args = args 22 | 23 | self.tag_vocab_path = self.train_path + ".tag_vocab" 24 | self.word_vocab_path = self.train_path + ".word_vocab" 25 | self.char_vocab_path = self.train_path + ".char_vocab" 26 | 27 | self.pretrained_embedding_path = args.pretrain_emb_path 28 | self.use_discrete_feature = args.use_discrete_features 29 | self.use_brown_cluster = args.use_brown_cluster 30 | 31 | self.train_senttypes = self.dev_senttypes = self.test_senttypes = None 32 | 33 | if self.use_brown_cluster: 34 | self.brown_cluster_dicts = get_brown_cluster(args.brown_cluster_path) 35 | self.brown_cluster_dicts[''] = len(self.brown_cluster_dicts) 36 | args.brown_cluster_num = len(self.brown_cluster_dicts) 37 | else: 38 | self.brown_cluster_dicts = None 39 | 40 | print("Generating vocabs from training file ....") 41 | paths_to_read = [self.train_path, self.dev_path, self.test_path] 42 | 43 | if args.fixedVocab: #Make vaocabulary from the args.aug_lang_train_path 44 | _, self.word_to_id, self.char_to_id = self.read_files([self.args.aug_lang_train_path]) 45 | self.tag_to_id = {} 46 | # self.word_to_id = {} 47 | # self.char_to_id = {} 48 | for tag in tagset: 49 | if args.misc: 50 | tag = tag.replace("GPE", "MISC") 51 | self.tag_to_id[tag] = len(self.tag_to_id) 52 | else: 53 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files(paths_to_read) 54 | print("Size of vocab before: %d" % len(self.word_to_id)) 55 | self.word_to_id[''] = len(self.word_to_id) + 1 56 | self.char_to_id[''] = len(self.char_to_id) + 1 57 | self.word_to_id['<\s>'] = 0 58 | self.char_to_id[''] = 0 59 | print("Size of vocab after: %d" % len(self.word_to_id)) 60 | self.word_padding_token = 0 61 | self.char_padding_token = 0 62 | 63 | if self.pretrained_embedding_path is not None: 64 | self.pretrain_word_emb, self.word_to_id, self.char_to_id = get_pretrained_emb(self.args.fixedVocab, self.pretrained_embedding_path, 65 | self.word_to_id, self.char_to_id, args.word_emb_dim) 66 | 67 | # for char vocab and word vocab, we reserve id 0 for the eos padding, and len(vocab)-1 for the 68 | self.id_to_tag = {v: k for k, v in self.tag_to_id.iteritems()} 69 | self.id_to_word = {v: k for k, v in self.word_to_id.iteritems()} 70 | self.id_to_char = {v: k for k, v in self.char_to_id.iteritems()} 71 | 72 | self.ner_vocab_size = len(self.id_to_tag) 73 | self.word_vocab_size = len(self.id_to_word) 
74 | self.char_vocab_size = len(self.id_to_char) 75 | 76 | self.cap_ratio_dict = None 77 | 78 | #Partial CRF 79 | self.B_UNK = self.ner_vocab_size + 1 80 | self.I_UNK = self.ner_vocab_size + 2 81 | 82 | print("Size of vocab after: %d" % len(self.word_to_id)) 83 | print("NER tag num=%d, Word vocab size=%d, Char Vocab size=%d" % (self.ner_vocab_size, self.word_vocab_size, self.char_vocab_size)) 84 | 85 | 86 | @staticmethod 87 | def exists(path): 88 | return os.path.exists(path) 89 | 90 | def read_one_line(self, line, tag_set, word_dict, char_set): 91 | for w in line: 92 | fields = w.split() 93 | if len(fields) !=2: 94 | print("ERROR! Incorrect number of fields in the file, required two.") 95 | print(fields) 96 | exit(0) 97 | word = fields[0] 98 | ner_tag = fields[-1] 99 | 100 | for c in word: 101 | char_set.add(c) 102 | if "UNK" not in ner_tag: 103 | if self.args.misc: 104 | ner_tag = ner_tag.replace("GPE","MISC") 105 | tag_set.add(ner_tag) 106 | word_dict[word] += 1 107 | 108 | def get_vocab_from_set(self, a_set, shift=0): 109 | vocab = {} 110 | for i, elem in enumerate(a_set): 111 | vocab[elem] = i + shift 112 | 113 | return vocab 114 | 115 | def get_vocab_from_dict(self, a_dict, shift=0, remove_singleton=False): 116 | vocab = {} 117 | i = 0 118 | self.singleton_words = set() 119 | 120 | #Sort the defaultdict 121 | sortedDict = sorted(a_dict.iteritems(), key=lambda (k, v): v, reverse=True) 122 | for (k,v) in sortedDict: 123 | 124 | #for k, v in a_dict.iteritems(): 125 | if v == 1: 126 | self.singleton_words.add(i + shift) 127 | if remove_singleton: 128 | if v > 1: 129 | # print k, v 130 | vocab[k] = i + shift 131 | i += 1 132 | else: 133 | vocab[k] = i + shift 134 | i += 1 135 | print("Singleton words number: %d" % len(self.singleton_words)) 136 | return vocab 137 | 138 | def read_files(self, paths): 139 | # word_list = [] 140 | # char_list = [] 141 | # tag_list = [] 142 | word_dict = defaultdict(lambda: 0) 143 | char_set = set() 144 | tag_set = set() 145 | 146 | def _read_a_file(path): 147 | with codecs.open(path, "r", "utf-8") as fin: 148 | to_read_line = [] 149 | for line in fin: 150 | if line.strip() == "": 151 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 152 | to_read_line = [] 153 | else: 154 | to_read_line.append(line.strip()) 155 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 156 | 157 | for path in paths: 158 | _read_a_file(path) 159 | 160 | tag_vocab = self.get_vocab_from_set(tag_set) 161 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 162 | char_vocab = self.get_vocab_from_set(char_set, 1) 163 | 164 | return tag_vocab, word_vocab, char_vocab 165 | 166 | def get_data_set(self, path, lang, source="train"): 167 | sents = [] 168 | char_sents = [] 169 | tgt_tags = [] 170 | discrete_features = [] 171 | bc_features = [] 172 | known_tags = [] 173 | 174 | if source == "train": 175 | sent_types = self.train_senttypes 176 | else: 177 | sent_types = self.dev_senttypes 178 | 179 | def add_sent(one_sent, type): 180 | temp_sent = [] 181 | temp_ner = [] 182 | temp_char = [] 183 | temp_bc = [] 184 | sent = [] 185 | temp_known_tag = [] 186 | for w in one_sent: 187 | fields = w.split() 188 | if len(fields)!=2: 189 | fields = w.split("\t") 190 | assert len(fields)==2 191 | word = fields[0] 192 | sent.append(word) 193 | ner_tag = fields[-1] 194 | if self.use_brown_cluster: 195 | temp_bc.append(self.brown_cluster_dicts[word] if word in self.brown_cluster_dicts else self.brown_cluster_dicts[""]) 196 | 197 | if 
self.args.fixedVocab: 198 | if word in self.word_to_id: 199 | temp_sent.append(self.word_to_id[word]) 200 | elif word.lower() in self.word_to_id: 201 | temp_sent.append(self.word_to_id[word.lower()]) 202 | else: 203 | temp_sent.append(self.word_to_id[""]) 204 | else: 205 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 206 | 207 | if "B-UNK" in ner_tag: 208 | temp_ner.append(self.B_UNK) 209 | elif "I-UNK" in ner_tag: 210 | temp_ner.append(self.I_UNK) 211 | else: 212 | if self.args.misc: 213 | ner_tag = ner_tag.replace("GPE","MISC") 214 | temp_ner.append(self.tag_to_id[ner_tag]) 215 | 216 | if "UNK" in ner_tag: 217 | temp_known_tag.append([0]) 218 | else: 219 | temp_known_tag.append([1]) 220 | 221 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 222 | 223 | sents.append(temp_sent) 224 | char_sents.append(temp_char) 225 | tgt_tags.append(temp_ner) 226 | bc_features.append(temp_bc) 227 | known_tags.append(temp_known_tag) 228 | discrete_features.append([]) 229 | 230 | # print len(discrete_features[-1]) 231 | 232 | with codecs.open(path, "r", "utf-8") as fin: 233 | i = 0 234 | one_sent = [] 235 | for line in fin: 236 | if line.strip() == "" or line.strip() == "\n": 237 | if len(one_sent) > 0: 238 | add_sent(one_sent, sent_types[i] if sent_types is not None else None) 239 | i += 1 240 | if i % 1000 == 0: 241 | print("Processed %d training data." % (i,)) 242 | one_sent = [] 243 | else: 244 | one_sent.append(line.strip()) 245 | 246 | if len(one_sent) > 0: 247 | add_sent(one_sent, sent_types[i] if sent_types is not None else None) 248 | i += 1 249 | 250 | if sent_types is not None: 251 | assert i == len(sent_types), "Not match between number of sentences and sentence types!" 252 | 253 | if self.use_discrete_feature: 254 | self.num_feats = len(discrete_features[0][0]) 255 | else: 256 | self.num_feats = 0 257 | return sents, char_sents, tgt_tags, discrete_features, bc_features, known_tags 258 | -------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | import dynet as dy 3 | import numpy as np 4 | from collections import defaultdict 5 | import gzip 6 | import cPickle as pkl 7 | import codecs 8 | import math 9 | import random 10 | from random import shuffle 11 | 12 | random.seed(448) 13 | np.random.seed(1) 14 | import operator 15 | import re 16 | 17 | MAX_CHAR_LENGTH = 45 18 | 19 | # Regular expressions used to normalize digits. 20 | DIGIT_RE = re.compile(br"\d") 21 | 22 | 23 | # word = utils.DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0] 24 | 25 | 26 | def iob2(tags): 27 | """ 28 | Check that tags have a valid IOB format. 29 | Tags in IOB1 format are converted to IOB2. 
30 | """ 31 | for i, tag in enumerate(tags): 32 | if tag == 'O': 33 | continue 34 | split = tag.split('-') 35 | if len(split) != 2 or split[0] not in ['I', 'B']: 36 | return False 37 | if split[0] == 'B': 38 | continue 39 | elif i == 0 or tags[i - 1] == 'O': # conversion IOB1 to IOB2 40 | tags[i] = 'B' + tag[1:] 41 | elif tags[i - 1][1:] == tag[1:]: 42 | continue 43 | else: # conversion IOB1 to IOB2 44 | tags[i] = 'B' + tag[1:] 45 | return True 46 | 47 | 48 | def get_entity(label): 49 | entities = [] 50 | i = 0 51 | while i < len(label): 52 | if label[i] != 'O': 53 | e_type = label[i][2:] 54 | j = i + 1 55 | while j < len(label) and label[j] == 'I-' + e_type: 56 | j += 1 57 | entities.append((i, j, e_type)) 58 | i = j 59 | else: 60 | i += 1 61 | return entities 62 | 63 | 64 | def evaluate_ner(pred, gold): 65 | tp = 0 66 | fp = 0 67 | fn = 0 68 | for i in range(len(pred)): 69 | pred_entities = get_entity(pred[i]) 70 | gold_entities = get_entity(gold[i]) 71 | temp = 0 72 | for entity in pred_entities: 73 | if entity in gold_entities: 74 | tp += 1 75 | temp += 1 76 | else: 77 | fp += 1 78 | fn += len(gold_entities) - temp 79 | precision = 1.0 * tp / (tp + fp) 80 | recall = 1.0 * tp / (tp + fn) 81 | f1 = 2 * precision * recall / (precision + recall) 82 | return precision, recall, f1 83 | 84 | 85 | def fopen(filename, mode='r'): 86 | if filename.endswith('.gz'): 87 | return gzip.open(filename, mode) 88 | return open(filename, mode) 89 | 90 | 91 | def get_pretrained_emb(fixedVocab, path_to_emb, word_to_id, char_to_id, dim): 92 | word_emb = [] 93 | print("Loading pretrained embeddings from %s." % (path_to_emb)) 94 | print("length of dict: %d" % len(word_to_id)) 95 | 96 | pretrain_word_emb = {} 97 | pretrain_vocab = [] 98 | for line in codecs.open(path_to_emb, "r", "utf-8", errors='replace'): 99 | items = line.strip().split() 100 | if len(items) == dim + 1: 101 | try: 102 | pretrain_word_emb[items[0]] = np.asarray(items[1:]).astype(np.float32) 103 | pretrain_vocab.append(items[0]) 104 | except ValueError: 105 | continue 106 | 107 | for _ in range(len(word_to_id)): 108 | word_emb.append(np.random.uniform(-math.sqrt(3.0 / dim), math.sqrt(3.0 / dim), size=dim)) 109 | 110 | not_covered = 0 111 | print(len(word_to_id), len(word_emb)) 112 | 113 | for word, id in word_to_id.iteritems(): 114 | if word in pretrain_word_emb: 115 | word_emb[id] = pretrain_word_emb[word] 116 | elif word.lower() in pretrain_word_emb: 117 | word_emb[id] = pretrain_word_emb[word.lower()] 118 | else: 119 | not_covered += 1 120 | 121 | if fixedVocab: 122 | #Take top 100000 from the word embeddings 123 | num = 0 124 | for word in pretrain_vocab: 125 | if num > 400000: 126 | break 127 | if word not in word_to_id: 128 | word_to_id[word] = len(word_to_id) 129 | word_emb.append(pretrain_word_emb[word]) 130 | num +=1 131 | 132 | else: 133 | for word in pretrain_word_emb.keys(): 134 | if word not in word_to_id: 135 | word_to_id[word] = len(word_to_id) 136 | word_emb.append(pretrain_word_emb[word]) 137 | 138 | emb = np.array(word_emb, dtype=np.float32) 139 | 140 | print("Word number not covered in pretrain embedding: %d" % not_covered) 141 | return emb, word_to_id, char_to_id 142 | 143 | 144 | def pkl_dump(obj, path): 145 | with open(path, "wb") as fout: 146 | pkl.dump(obj, fout) 147 | 148 | 149 | def pkl_load(path): 150 | with open(path, "rb") as fin: 151 | obj = pkl.load(fin) 152 | return obj 153 | 154 | 155 | def log_sum_exp_dim_0(x): 156 | # numerically stable log_sum_exp 157 | dims = x.dim() 158 | max_score = dy.max_dim(x, 0) # 
(dim_1, batch_size) 159 | if len(dims[0]) == 1: 160 | max_score_extend = max_score 161 | else: 162 | max_score_reshape = dy.reshape(max_score, (1, dims[0][1]), batch_size=dims[1]) 163 | max_score_extend = dy.concatenate([max_score_reshape] * dims[0][0]) 164 | x = x - max_score_extend 165 | exp_x = dy.exp(x) 166 | # (dim_1, batch_size), if no dim_1, return ((1,), batch_size) 167 | log_sum_exp_x = dy.log(dy.mean_dim(exp_x, d=[0], b=False) * dims[0][0]) 168 | return log_sum_exp_x + max_score 169 | 170 | 171 | def data_iterator(data_pair, batch_size): 172 | batches = make_bucket_batches(data_pair, batch_size) 173 | for batch in batches: 174 | yield batch 175 | 176 | 177 | def make_bucket_batches(data_collections, batch_size): 178 | # Data are bucketed according to the length of the first item in the data_collections. 179 | buckets = defaultdict(list) 180 | tot_items = len(data_collections[0]) 181 | for data_item in data_collections: 182 | src = data_item[0] 183 | buckets[len(src)].append(data_item) 184 | 185 | batches = [] 186 | # np.random.seed(2) 187 | for src_len in buckets: 188 | bucket = buckets[src_len] 189 | np.random.shuffle(bucket) 190 | 191 | num_batches = int(np.ceil(len(bucket) * 1.0 / batch_size)) 192 | for i in range(num_batches): 193 | cur_batch_size = batch_size if i < num_batches - 1 else len(bucket) - batch_size * i 194 | batches.append([[bucket[i * batch_size + j][k] for j in range(cur_batch_size)] for k in range(tot_items)]) 195 | np.random.shuffle(batches) 196 | return batches 197 | 198 | 199 | def transpose_input(seq, padding_token=0): 200 | # input seq: list of samples [[w1, w2, ..], [w1, w2, ..]] 201 | max_len = max([len(sent) for sent in seq]) 202 | seq_pad = [] 203 | seq_mask = [] 204 | for i in range(max_len): 205 | pad_temp = [sent[i] if i < len(sent) else padding_token for sent in seq] 206 | mask_temp = [1.0 if i < len(sent) else 0.0 for sent in seq] 207 | seq_pad.append(pad_temp) 208 | seq_mask.append(mask_temp) 209 | 210 | return seq_pad, seq_mask 211 | 212 | 213 | def transpose_discrete_features(feature_batch): 214 | # Discrete features are zero-one features 215 | # TODO: Other integer features, create lookup tables 216 | # tgt_batch: [[[feature of word 1 of sent 1], [feature of word 2 of sent 2], ]] 217 | # return: [(feature_num, batchsize)] 218 | max_sent_len = max([len(s) for s in feature_batch]) 219 | feature_num = len(feature_batch[0][0]) 220 | batch_size = len(feature_batch) 221 | features = [] # each: (feature_num, batch_size) 222 | for i in range(max_sent_len): 223 | w_i_feature = [dy.inputTensor(sent[i], batched=True) if i < len(sent) else dy.zeros(feature_num) for sent in feature_batch] 224 | w_i_feature = dy.reshape(dy.concatenate(w_i_feature, d=1), (feature_num,), batch_size=batch_size) 225 | features.append(w_i_feature) 226 | 227 | return features 228 | 229 | 230 | def transpose_and_batch_embs(input_embs, emb_size): 231 | # input_embs: [[w1_emb, w2_emb, ]], embs are dy.expressions 232 | max_len = max(len(sent) for sent in input_embs) 233 | batch_size = len(input_embs) 234 | padded_seq_emb = [] 235 | seq_masks = [] 236 | for i in range(max_len): 237 | w_i_emb = [sent[i] if i < len(sent) else dy.zeros(emb_size) for sent in input_embs] 238 | w_i_emb = dy.reshape(dy.concatenate(w_i_emb, d=1), (emb_size,), batch_size=batch_size) 239 | w_i_mask = [1.0 if i < len(sent) else 0.0 for sent in input_embs] 240 | padded_seq_emb.append(w_i_emb) 241 | seq_masks.append(w_i_mask) 242 | 243 | return padded_seq_emb, seq_masks 244 | 245 | 246 | def 
transpose_char_input(tgt_batch, padding_token): 247 | # The tgt_batch may not be padded with and 248 | # tgt_batch: [[[, , ], [, s,h,e, ], 249 | # [, i,s, ], [, p,r,e,t,t,y, ], [, , ]], [[],[],[]]] 250 | max_sent_len = max([len(s) for s in tgt_batch]) 251 | sent_w_batch = [] # each is list of list: max_word_len, batch_size 252 | sent_mask_batch = [] # each is list of list: max_word_len, batch_size 253 | max_w_lens = [] 254 | SOW_PAD = 0 255 | EOW_PAD = 1 256 | EOS_PAD = 2 257 | for i in range(max_sent_len): 258 | max_len_w = max([len(sent[i]) for sent in tgt_batch if i < len(sent)]) 259 | max_w_lens.append(max_len_w) 260 | w_batch = [] 261 | mask_batch = [] 262 | for j in range(0, max_len_w): 263 | temp_j_w = [] 264 | for sent in tgt_batch: 265 | if i < len(sent) and j < len(sent[i]): 266 | temp_j_w.append(sent[i][j]) 267 | elif i >= len(sent): 268 | if j == 0: 269 | temp_j_w.append(SOW_PAD) 270 | elif j == max_len_w - 1: 271 | temp_j_w.append(EOW_PAD) 272 | else: 273 | temp_j_w.append(EOS_PAD) 274 | else: 275 | temp_j_w.append(EOW_PAD) 276 | # w_batch = [sent[i][j] if i < len(sent) and j < len(sent[i]) else self.EOW for sent in tgt_batch] 277 | # print "temp: ", temp_j_w 278 | w_batch.append(temp_j_w) 279 | mask_batch.append([1. if i < len(sent) and j < len(sent[i]) else 0.0 for sent in tgt_batch]) 280 | sent_w_batch.append(w_batch) 281 | sent_mask_batch.append(mask_batch) 282 | return sent_w_batch, sent_mask_batch, max_sent_len, max_w_lens 283 | 284 | def get_vocab_from_set(a_set, shift=0): 285 | vocab = {} 286 | for i, elem in enumerate(a_set): 287 | vocab[elem] = i + shift 288 | 289 | return vocab 290 | 291 | if __name__ == "__main__": 292 | # from scipy.misc import logsumexp 293 | # import numpy as np 294 | # 295 | # # a = np.random.rand(3, 4, 2) 296 | # # b = logsumexp(a, axis=0) 297 | # # a_t = dy.inputTensor(a, batched=True) 298 | # # b_t = log_sum_exp_dim_0(a_t) 299 | # # print "numpy " 300 | # # print b 301 | # # print "dynet " 302 | # # print b_t.value(), b_t.dim() 303 | # # print dy.pick_batch_elem(b_t, 1).npvalue() 304 | # 305 | # a = np.random.rand(3, 2) 306 | # b = logsumexp(a, axis=0) 307 | # a_t = dy.inputTensor(a, batched=True) 308 | # b_t = log_sum_exp_dim_0(a_t) 309 | # print "numpy " 310 | # print b 311 | # print "dynet " 312 | # print b_t.value(), b_t.dim() 313 | # print dy.pick_batch_elem(b_t, 1).npvalue() 314 | dim = 100 315 | # 9 1000 316 | 317 | path_to_emb = "../datasets/english/glove.6B/glove.6B.100d.txt" 318 | pretrain_word_emb = {} 319 | i = 1 320 | for line in codecs.open(path_to_emb, "r", 'utf-8', errors='replace'): 321 | items = line.strip().split() 322 | if len(items) == dim + 1: 323 | try: 324 | pretrain_word_emb[items[0]] = np.asarray(items[1:]).astype(np.float32) 325 | except ValueError: 326 | continue 327 | print items[0], i, pretrain_word_emb[items[0]][:3] 328 | i += 1 329 | 330 | # gradient clipping 331 | # turn off the dropout 332 | # use smaller initial lr 333 | # variational dropout 334 | -------------------------------------------------------------------------------- /models/encoders.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | from utils.util import * 3 | 4 | ''' Designing idea: the encoder should be agnostic to the input, it can be either 5 | arbitrary spans, characters, or words, or even raw feature. However, user has to specify 6 | whether to have the lookup table for any input. 
7 | 8 | There are also two ways to feed in multiple input features: 9 | (a) First concatenate all features for each position, and then use them as features for one encoder, e.g. bilstm 10 | (b) Use multiple encoders for multiple features then combine outputs from multiple encoders, either concat them 11 | or feed them to another encoder.''' 12 | 13 | 14 | class Encoder(): 15 | def __init__(self): 16 | pass 17 | 18 | def encode(self): 19 | raise NotImplementedError 20 | 21 | # class concat_input_encoder(encoder): 22 | # def __init__(self, model, lookups, lookup_table_dims): 23 | # # length of elements in lookup_table_dims == number of elements in lookups which are true 24 | # self.num_inputs = len(lookups) 25 | # self.lookups = lookups 26 | # self.lookup_params = [] 27 | # for i, lookup in enumerate(lookups): 28 | # if lookup == 1: 29 | # # add loop up parameters 30 | # self.lookup_params.append(model.add_lookup_parameters((lookup_table_dims[i][0], lookup_table_dims[i][1]))) 31 | # elif lookup == 2: 32 | # # add normal transformation parameters 33 | # # dims: discrete_feature_num, continuous_emb_dim 34 | # # the input should concatenate all the discrete features together first 35 | # self.lookup_params.append(model.add_parameters((lookup_table_dims[i][0], lookup_table_dims[i][1]))) 36 | # else: 37 | # self.lookup_params.append(0) 38 | # 39 | # def prepare_inputs(self, inputs): 40 | # # inputs: (a) 41 | # input_features = [] 42 | # for i, lookup in enumerate(self.lookups): 43 | # if lookup == 1: 44 | 45 | 46 | class Lookup_Encoder(Encoder): 47 | def __init__(self, model, args, vocab_size, emb_size, padding_token=None, pretrain_embedding=None, isFeatureEmb=False): 48 | Encoder.__init__(self) 49 | self.padding_token = padding_token 50 | self.map_pretrain = args.map_pretrain 51 | self.pretrain_fix = args.pretrain_fix 52 | self.isFeatureEmb = isFeatureEmb 53 | if args.map_pretrain: 54 | self.W_map = model.add_parameters((args.map_dim, emb_size)) 55 | self.b_map = model.add_parameters(args.map_dim) 56 | self.b_map.zero() 57 | if pretrain_embedding is not None: 58 | self.lookup_table = model.lookup_parameters_from_numpy(pretrain_embedding) 59 | else: 60 | self.lookup_table = model.add_lookup_parameters((vocab_size, emb_size)) 61 | 62 | def encode(self, input_seqs): 63 | transpose_inputs, _ = transpose_input(input_seqs, self.padding_token) 64 | embs = [dy.lookup_batch(self.lookup_table, wids) for wids in transpose_inputs] 65 | if self.pretrain_fix and not self.isFeatureEmb: 66 | embs = [dy.nobackprop(emb) for emb in embs] 67 | # TODO: initialize with ones vector, initialize W_map with identity matrix 68 | if self.map_pretrain and not self.isFeatureEmb: 69 | if not self.pretrain_fix: 70 | embs = [dy.nobackprop(emb) for emb in embs] 71 | W_map = dy.parameter(self.W_map) 72 | b_map = dy.parameter(self.b_map) 73 | embs = [dy.affine_transform([b_map, W_map, emb]) for emb in embs] 74 | return embs 75 | 76 | 77 | class Discrete_Feature_Encoder(Encoder): 78 | def __init__(self, model, num_feats, to_dim): 79 | Encoder.__init__(self) 80 | self.num_feats = num_feats 81 | self.to_dim = to_dim 82 | self.W_feat_emb = model.add_parameters((to_dim, num_feats)) 83 | 84 | def encode(self, input_feats): 85 | batch_size = len(input_feats) 86 | # after transpose: input_feats: [(num_feats, batch_size)] 87 | input_feats = transpose_discrete_features(input_feats) 88 | W_feat_emb = dy.parameter(self.W_feat_emb) 89 | output_emb = [] 90 | for wif in input_feats: 91 | extend_wif = dy.transpose(dy.concatenate_cols([wif for _ 
in range(self.to_dim)])) 92 | feature_emb = dy.cmult(extend_wif, W_feat_emb) 93 | output_emb.append(dy.reshape(feature_emb, (self.to_dim * self.num_feats, ), batch_size=batch_size)) 94 | return output_emb 95 | 96 | 97 | class CNN_Encoder(Encoder): 98 | def __init__(self, model, emb_size, win_size=3, filter_size=64, dropout=0.5, vocab_size=0, padding_token=0, lookup_emb=None): 99 | Encoder.__init__(self) 100 | self.vocab_size = vocab_size # if 0, no lookup tables 101 | self.win_size = win_size 102 | self.filter_size = filter_size 103 | self.emb_size = emb_size 104 | self.dropout_rate = dropout 105 | self.paddding_token = padding_token 106 | if vocab_size != 0: 107 | print("In CNN encoder: creating lookup embedding!") 108 | self.lookup_emb = model.add_lookup_parameters((vocab_size, 1, 1, emb_size)) 109 | else: 110 | assert lookup_emb is not None 111 | print("In CNN encoder: reusing lookup embedding!") 112 | self.lookup_emb = lookup_emb 113 | 114 | self.W_cnn = model.add_parameters((1, win_size, emb_size, filter_size)) 115 | self.b_cnn = model.add_parameters((filter_size)) 116 | self.b_cnn.zero() 117 | 118 | def _cnn_emb(self, input_embs, training): 119 | # input_embs: (h, time_step, dim, batch_size), h=1 120 | if self.dropout_rate > 0 and training: 121 | input_embs = dy.dropout(input_embs, self.dropout_rate) 122 | W_cnn = dy.parameter(self.W_cnn) 123 | b_cnn = dy.parameter(self.b_cnn) 124 | 125 | cnn_encs = dy.conv2d_bias(input_embs, W_cnn, b_cnn, stride=(1, 1), is_valid=False) 126 | tanh_cnn_encs = dy.tanh(cnn_encs) 127 | max_pool_out = dy.reshape(dy.max_dim(tanh_cnn_encs, d=1), (self.filter_size,)) 128 | # rec_pool_out = dy.rectify(max_pool_out) 129 | return max_pool_out 130 | 131 | def encode(self, input_seqs, training=True, char=True): 132 | batch_size = len(input_seqs) 133 | sents_embs = [] 134 | if char: 135 | # we don't batch at first, we batch after cnn 136 | for sent in input_seqs: 137 | sent_emb = [] 138 | for w in sent: 139 | if len(w) < self.win_size: 140 | w += [self.paddding_token] * (self.win_size - len(w)) 141 | input_embs = dy.concatenate([dy.lookup(self.lookup_emb, c) for c in w], d=1) 142 | w_emb = self._cnn_emb(input_embs, training) # (filter_size, 1) 143 | sent_emb.append(w_emb) 144 | sents_embs.append(sent_emb) 145 | sents_embs, sents_mask = transpose_and_batch_embs(sents_embs, self.filter_size) # [(filter_size, batch_size)] 146 | else: 147 | for sent in input_seqs: 148 | if self.vocab_size != 0: 149 | if len(sent) < self.win_size: 150 | sent += [0] * (self.win_size - len(sent)) 151 | input_embs = dy.concatenate([dy.lookup(self.lookup_emb, w) for w in sent], d=1) 152 | else: 153 | # input_seqs: [(emb_size, batch_size)] 154 | if len(sent) < self.win_size: 155 | sent += [dy.zeros(self.emb_size)] * (self.win_size - len(sent)) 156 | input_embs = dy.transpose(dy.concatenate_cols(sent)) # (time_step, emb_size, bs) 157 | input_embs = dy.reshape(input_embs, (1, len(sent), self.emb_size), ) 158 | 159 | sent_emb = self._cnn_emb(input_embs, training) # (filter_size, 1) 160 | sents_embs.append(sent_emb) 161 | sents_embs = dy.reshape(dy.concatenate(sents_embs, d=1), (self.filter_size,), batch_size =batch_size) # (filter_size, batch_size) 162 | 163 | return sents_embs 164 | 165 | 166 | class BiRNN_Encoder(Encoder): 167 | def __init__(self, 168 | model, 169 | input_dim, 170 | hidden_dim, 171 | emb_dropout_rate=0.3, 172 | output_dropout_rate=0.5, 173 | padding_token=None, 174 | vocab_size=0, 175 | emb_size=0, 176 | layer=1, 177 | rnn="lstm", 178 | vocab_emb=None): 179 | 
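        # Three usage modes (summarising the branches below): vocab_size > 0 creates a fresh
        # (vocab_size, emb_size) lookup table; vocab_size == 0 with vocab_emb given reuses an
        # existing lookup (e.g. the CNN char lookup); with neither, encode_seq expects
        # pre-computed embedding expressions as input.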
Encoder.__init__(self) 180 | # self.birnn = dy.BiRNNBuilder(layer, input_dim, hidden_dim, model, dy.LSTMBuilder if rnn == "lstm" else dy.GRUBuilder) 181 | self.fwd_RNN = dy.LSTMBuilder(layer, input_dim, hidden_dim, model) if rnn == "lstm" else dy.GRUBuilder(layer, input_dim, hidden_dim, model) 182 | self.bwd_RNN = dy.LSTMBuilder(layer, input_dim, hidden_dim, model) if rnn == "lstm" else dy.GRUBuilder(layer, input_dim, hidden_dim, model) 183 | 184 | self.input_dim = input_dim 185 | self.vocab_size = vocab_size 186 | self.padding_token = padding_token 187 | self.drop_out_rate = output_dropout_rate 188 | self.emb_drop_rate = emb_dropout_rate 189 | self.hidden_dim = hidden_dim 190 | if vocab_size > 0: 191 | print("In BiRNN, creating lookup table!") 192 | self.vocab_emb = model.add_lookup_parameters((vocab_size, emb_size)) 193 | else: 194 | if vocab_emb is not None: 195 | # assert vocab_emb is not None 196 | self.vocab_emb = vocab_emb 197 | else: 198 | self.vocab_emb = None 199 | 200 | def encode(self, input_seqs, training=True, char=False): 201 | if char: 202 | return self.encode_word(input_seqs, training=training) 203 | else: 204 | return self.encode_seq(input_seqs, training=training) 205 | 206 | def encode_seq(self, input_seqs, training=True, char=False): 207 | if self.vocab_emb is not None: 208 | # input_seqs = [[w1, w2],[]] 209 | transpose_inputs, _ = transpose_input(input_seqs, self.padding_token) 210 | if self.vocab_size != 0: 211 | w_embs = [dy.dropout(dy.lookup_batch(self.vocab_emb, wids), 212 | self.emb_drop_rate) if self.emb_drop_rate > 0. and training 213 | else dy.lookup_batch(self.vocab_emb, wids) 214 | for wids in transpose_inputs] 215 | else: 216 | # print "In our case, use parameters shared by CNN char encoder, need conversion!" 217 | vocab_emb = dy.parameter(self.vocab_emb) 218 | vocab_size = vocab_emb.dim()[0][-1] 219 | # print "In BiRNN Char vocab size: ", vocab_size 220 | vocab_emb = dy.reshape(vocab_emb, (self.input_dim, vocab_size)) # expression, not lookup_parameters 221 | 222 | # for wids in transpose_inputs: 223 | # print wids 224 | # print vocab_emb.dim() 225 | # a = dy.pick_batch(vocab_emb, wids, dim=1) 226 | # print a.value() 227 | # Special case handler: use pick_batch 228 | w_embs = [dy.dropout(dy.pick_batch(vocab_emb, wids, dim=1), 229 | self.emb_drop_rate) if self.emb_drop_rate > 0. and training 230 | else dy.pick_batch(vocab_emb, wids, dim=1) 231 | for wids in transpose_inputs] 232 | # print "In BiRNN char: ", w_embs[0].dim() 233 | else: 234 | w_embs = [dy.dropout(emb, self.emb_drop_rate) if self.emb_drop_rate > 0. and training else emb for emb in input_seqs] 235 | # if vocab_size = 0: input_seqs = [(input_dim, batch_size)] 236 | 237 | w_embs_r = w_embs[::-1] 238 | # birnn_outputs = [dy.dropout(emb, self.drop_out_rate) if self.drop_out_rate > 0. 
else emb for emb in self.birnn.transduce(w_embs)] 239 | fwd_vectors = self.fwd_RNN.initial_state().transduce(w_embs) 240 | bwd_vectors = self.bwd_RNN.initial_state().transduce(w_embs_r)[::-1] 241 | 242 | if char: 243 | return dy.concatenate([fwd_vectors[-1], bwd_vectors[0]]) 244 | 245 | birnn_outputs = [dy.dropout(dy.concatenate([fwd_v, bwd_v]), self.drop_out_rate) if self.drop_out_rate > 0.0 and training 246 | else dy.concatenate([fwd_v, bwd_v]) 247 | for (fwd_v, bwd_v) in zip(fwd_vectors, bwd_vectors)] 248 | return birnn_outputs 249 | 250 | def encode_word(self, input_seqs, training=True): 251 | # embedding dropout rate is 0.0, because we dropout at the later stage of RNN 252 | sents_embs = [] 253 | 254 | for sent in input_seqs: 255 | sent_emb = [] 256 | for w in sent: 257 | w_emb = self.encode_seq([w], training=training, char=True) 258 | sent_emb.append(w_emb) 259 | sents_embs.append(sent_emb) 260 | sents_embs, sents_mask = transpose_and_batch_embs(sents_embs, self.hidden_dim*2) # [(hidden_dim*2, batch_size)] 261 | return sents_embs -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou and aditichaudhary' 2 | import sys 3 | reload(sys) 4 | sys.setdefaultencoding('utf-8') 5 | 6 | def evaluate(data_loader, path, model, model_name,type="dev"): 7 | sents, char_sents, tgt_tags, discrete_features, bc_feats,_ = data_loader.get_data_set(path, args.lang, source="dev") 8 | 9 | prefix = model_name + "_" + str(uid) 10 | # tot_acc = 0.0 11 | predictions = [] 12 | gold_standards = [] 13 | sentences = [] 14 | i = 0 15 | sentence_gold = {} 16 | 17 | score_sent = {} 18 | for sent, char_sent, tgt_tag, discrete_feature, bc_feat in zip(sents, char_sents, tgt_tags, discrete_features, bc_feats): 19 | dy.renew_cg() 20 | sent, char_sent, discrete_feature, bc_feat = [sent], [char_sent], [discrete_feature], [bc_feat] 21 | best_score, best_path = model.eval(sent, char_sent, discrete_feature, bc_feat, training=False,type=type) 22 | 23 | assert len(best_path) == len(tgt_tag) 24 | # acc = model.crf_decoder.cal_accuracy(best_path, tgt_tag) 25 | # tot_acc += acc 26 | predictions.append(best_path) 27 | gold_standards.append(tgt_tag) 28 | 29 | sentences.append(sent) 30 | sent_key = " ".join([str(x) for x in sent[0]]) 31 | sentence_gold[sent_key] = tgt_tag 32 | score_sent[sent_key] = best_score 33 | 34 | i += 1 35 | if i % 1000 == 0: 36 | print("Testing processed %d lines " % i) 37 | 38 | pred_output_fname = "%s/%s_pred_output.txt" % (args.eval_folder,prefix) 39 | eval_output_fname = "%s_eval_score.txt" % (prefix) 40 | with open(pred_output_fname, "w") as fout: 41 | for sent, pred, gold in zip(sentences, predictions, gold_standards): 42 | for s, p, g in zip(sent[0], pred, gold): 43 | fout.write(data_loader.id_to_word[int(s)] + " " + data_loader.id_to_tag[g] + " " + data_loader.id_to_tag[p] + "\n") 44 | fout.write("\n") 45 | 46 | os.system("%s/conlleval.v2 < %s > %s" % (args.eval_folder,pred_output_fname, eval_output_fname)) 47 | 48 | with open(eval_output_fname, "r") as fin: 49 | lid = 0 50 | for line in fin: 51 | if lid == 1: 52 | fields = line.split(";") 53 | acc = float(fields[0].split(":")[1].strip()[:-1]) 54 | precision = float(fields[1].split(":")[1].strip()[:-1]) 55 | recall = float(fields[2].split(":")[1].strip()[:-1]) 56 | f1 = float(fields[3].split(":")[1].strip()) 57 | lid += 1 58 | 59 | output = open(eval_output_fname, "r").read().strip() 60 | print(output) 61 | 
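    # Note: the parsing above assumes the second line of conlleval output, which looks like
    #   accuracy:  90.12%; precision:  85.34%; recall:  80.56%; FB1:  82.88
    # (the numbers shown here are only an illustration).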
if type == "dev": 62 | os.system("rm %s" % (eval_output_fname,)) 63 | os.system("rm %s" % (pred_output_fname,)) 64 | 65 | 66 | return acc, precision, recall, f1,sentence_gold, score_sent 67 | 68 | 69 | def replace_singletons(data_loader, sents, replace_rate): 70 | new_batch_sents = [] 71 | for sent in sents: 72 | new_sent = [] 73 | for word in sent: 74 | if word in data_loader.singleton_words: 75 | new_sent.append(word if np.random.uniform(0., 1.) > replace_rate else data_loader.word_to_id[""]) 76 | else: 77 | new_sent.append(word) 78 | new_batch_sents.append(new_sent) 79 | return new_batch_sents 80 | 81 | 82 | def main(args): 83 | prefix = args.model_name + "_" + str(uid) 84 | print("PREFIX: %s" % prefix) 85 | final_darpa_output_fname = "%s/%s_output.conll" % (args.eval_folder,prefix) 86 | best_output_fname = "%s/best_%s_output.conll" % (args.eval_folder,prefix) 87 | ner_data_loader = NER_DataLoader(args) 88 | print(ner_data_loader.id_to_tag) 89 | 90 | #Loading training data from CoNLL format 91 | if not args.data_aug: 92 | sents, char_sents, tgt_tags, discrete_features, bc_features,known_tags = ner_data_loader.get_data_set(args.train_path, args.lang) 93 | else: 94 | sents_tgt, char_sents_tgt, tags_tgt, dfs_tgt, bc_feats_tgt,known_tags_tgt = ner_data_loader.get_data_set(args.tgt_lang_train_path, args.lang) 95 | sents_aug, char_sents_aug, tags_aug, dfs_aug, bc_feats_aug, known_tags_aug= ner_data_loader.get_data_set(args.aug_lang_train_path, args.aug_lang) 96 | sents, char_sents, tgt_tags, discrete_features, bc_features,known_tags = sents_tgt+sents_aug, char_sents_tgt+char_sents_aug, tags_tgt+tags_aug, dfs_tgt+dfs_aug, bc_feats_tgt+bc_feats_aug,known_tags_tgt+known_tags_aug 97 | 98 | 99 | print("Data set size (train): %d" % len(sents)) 100 | print("Number of discrete features: ", ner_data_loader.num_feats) 101 | epoch = bad_counter = updates = tot_example = cum_loss = 0 102 | patience = args.patience 103 | 104 | display_freq = 100 105 | valid_freq = args.valid_freq 106 | batch_size = args.batch_size 107 | 108 | 109 | print("Using Char CNN model!") 110 | model = vanilla_NER_CRF_model(args, ner_data_loader) 111 | inital_lr = args.init_lr 112 | 113 | if args.fineTune: 114 | print("Loading pre-trained model!") 115 | model.load() 116 | 117 | if len(sents) < 100: 118 | inital_lr = 0.0001 119 | else: 120 | inital_lr = args.init_lr #+ inital_lr * len(sents) / 1500.0 121 | 122 | 123 | trainer = dy.MomentumSGDTrainer(model.model, inital_lr, 0.9) 124 | 125 | def _check_batch_token(batch, id_to_vocab): 126 | for line in batch: 127 | print([id_to_vocab[i] for i in line]) 128 | 129 | def _check_batch_char(batch, id_to_vocab): 130 | for line in batch: 131 | print([u" ".join([id_to_vocab[c] for c in w]) for w in line]) 132 | 133 | lr_decay = args.decay_rate 134 | 135 | # decay_patience = 3 136 | # decay_num = 0 137 | valid_history = [] 138 | best_results = [0.0, 0.0, 0.0, 0.0] 139 | while epoch <= args.tot_epochs: 140 | batches = make_bucket_batches( 141 | zip(sents, char_sents, tgt_tags, discrete_features, bc_features, known_tags), batch_size) 142 | 143 | for b_sents, b_char_sents, b_ner_tags, b_feats, b_bc_feats, b_known_tags in batches: 144 | dy.renew_cg() 145 | 146 | if args.replace_unk_rate > 0.0: 147 | b_sents = replace_singletons(ner_data_loader, b_sents, args.replace_unk_rate) 148 | # _check_batch_token(b_sents, ner_data_loader.id_to_word) 149 | # _check_batch_token(b_ner_tags, ner_data_loader.id_to_tag) 150 | # _check_batch_char(b_char_sents, ner_data_loader.id_to_char) 151 | 152 | loss = 
model.cal_loss(b_sents, b_char_sents, b_ner_tags, b_feats, b_bc_feats, b_known_tags, training=True) 153 | loss_val = loss.value() 154 | cum_loss += loss_val * len(b_sents) 155 | tot_example += len(b_sents) 156 | 157 | updates += 1 158 | loss.backward() 159 | trainer.update() 160 | 161 | if updates % display_freq == 0: 162 | print("Epoch = %d, Updates = %d, CRF Loss=%f, Accumulative Loss=%f." % (epoch, updates, loss_val, cum_loss*1.0/tot_example)) 163 | if updates % valid_freq == 0: 164 | acc, precision, recall, f1,_,_ = evaluate(ner_data_loader, args.dev_path, model, args.model_name) 165 | 166 | if len(valid_history) == 0 or f1 > max(valid_history): 167 | bad_counter = 0 168 | best_results = [acc, precision, recall, f1] 169 | if updates > 0: 170 | print("Saving the best model so far.......") 171 | model.save() 172 | else: 173 | bad_counter += 1 174 | if args.lr_decay and bad_counter >= 3 and os.path.exists(args.save_to_path): 175 | bad_counter = 0 176 | model.load() 177 | lr = inital_lr / (1 + epoch * lr_decay) 178 | print("Epoch = %d, Learning Rate = %f." % (epoch, lr)) 179 | trainer = dy.MomentumSGDTrainer(model.model, lr) 180 | 181 | if bad_counter > patience: 182 | print("Early stop!") 183 | print("Best on validation: acc=%f, prec=%f, recall=%f, f1=%f" % tuple(best_results)) 184 | 185 | acc, precision, recall, f1,sentence_gold, score_sent = evaluate(ner_data_loader, args.test_path, model, args.model_name,"test") 186 | if args.SPAN_wise: 187 | createAnnotationOutput_SPAN_wise(args, model, ner_data_loader, sentence_gold, score_sent) 188 | 189 | exit(0) 190 | valid_history.append(f1) 191 | epoch += 1 192 | 193 | 194 | 195 | _,_,_,_,sentence_gold, score_sent = evaluate(ner_data_loader, args.test_path, model, args.model_name,"test") 196 | if args.SPAN_wise: 197 | createAnnotationOutput_SPAN_wise(args, model, ner_data_loader, sentence_gold, score_sent) 198 | print("All Epochs done.") 199 | 200 | def createAnnotationOutput_SPAN_wise(args, model, data_loader, sentence_gold, score_sent): 201 | # normalize all the entropy_spans ONLY DONE for the CFB 202 | 203 | 204 | reverse = True #For ETAL we look at the highest entropy ones, hence sorting is reversed 205 | if args.use_CFB: #For CFEAL we look at the least confident, hence sorting is not reversed 206 | reverse = False 207 | 208 | 209 | # Order the sentences by entropy of the spans 210 | fout= codecs.open(args.to_annotate, "w", encoding='utf-8') 211 | 212 | sorted_spans = sorted(model.crf_decoder.most_uncertain_entropy_spans, key=lambda k:model.crf_decoder.most_uncertain_entropy_spans[k],reverse=reverse) 213 | print("Total unique spans: {0}".format(len(sorted_spans))) 214 | count_span = args.k 215 | count_tokens = args.k 216 | 217 | #DEBUG Print Span Entropy in the sorted order of selected spans 218 | fdebug = codecs.open("./" + args.model_name + "_span_entropy_debug.txt", "w", encoding='utf-8') 219 | 220 | for sorted_span in sorted_spans: 221 | 222 | span_words= [] 223 | if count_tokens <=0: 224 | break 225 | (span_entropy,sentence_key, start, end,best_path) = model.crf_decoder.most_uncertain_entropy_spans[sorted_span] 226 | gold_path = sentence_gold[sentence_key] 227 | sent = sentence_key.split() 228 | 229 | for t in sorted_span.split(): 230 | span_words.append(data_loader.id_to_word[int(t)]) 231 | fdebug.write(" ".join(span_words) + " " + str(span_entropy) + "\n") 232 | 233 | first = True 234 | path = deepcopy(best_path) 235 | for i in range(start, end): 236 | if first: 237 | path[i] = -10 #Id for B-UNK 238 | first = False 239 | else: 240 | 
path[i] = -11 #Id for I-UNK 241 | 242 | idx = 0 243 | for token, tag in zip(sent, path): 244 | 245 | if tag == -10: 246 | tag_label = "B-UNK" 247 | count_span -= 1 248 | count_tokens -= 1 249 | elif tag == -11: 250 | tag_label = "I-UNK" 251 | count_tokens -= 1 252 | else: 253 | tag_label = data_loader.id_to_tag[tag] 254 | 255 | gold_tag_label = data_loader.id_to_tag[gold_path[idx]] 256 | idx += 1 257 | fout.write(data_loader.id_to_word[int(token)] + "\t" + tag_label + "\t" + gold_tag_label + "\n") 258 | 259 | fout.write("\n") 260 | 261 | print("Total unique spans for exercise: {0}".format(args.k)) 262 | 263 | #SAL: Select most uncertain sequence 264 | basename = os.path.basename(args.to_annotate).replace(".conll", "") 265 | LC_output_file = os.path.dirname(args.to_annotate) + "/" + basename + "_LC.conll" 266 | count_tokens = args.k 267 | with codecs.open(LC_output_file, "w", encoding='utf-8') as fout: 268 | idx = 0 269 | for sentence_key in sorted(score_sent.keys(), reverse=False): 270 | if count_tokens<=0: 271 | break 272 | sent = sentence_key.split() 273 | gold_path = sentence_gold[sentence_key] 274 | token_count = 0 275 | for token in sent: 276 | count_tokens -= 1 277 | gold_tag_label = data_loader.id_to_tag[gold_path[token_count]] 278 | token_count += 1 279 | fout.write(data_loader.id_to_word[int(token)] + "\t" + "UNK " + gold_tag_label + "\n") 280 | fout.write("\n") 281 | idx += 1 282 | 283 | 284 | def test_single_model(args): 285 | ner_data_loader = NER_DataLoader(args) 286 | # ugly: get discrete number features 287 | _, _, _, _, _,_ = ner_data_loader.get_data_set(args.train_path, args.lang) 288 | 289 | print("Using Char CNN model!") 290 | model = vanilla_NER_CRF_model(args, ner_data_loader) 291 | model.load() 292 | 293 | _,_,_,_,sentence_gold, score_sent = evaluate(ner_data_loader, args.test_path, model, args.model_name,"test") 294 | if args.SPAN_wise: 295 | createAnnotationOutput_SPAN_wise(args, model, ner_data_loader, sentence_gold, score_sent) 296 | 297 | 298 | 299 | 300 | from args import init_config 301 | 302 | args = init_config() 303 | from models.model_builder import * 304 | import os 305 | import uuid 306 | from dataloaders.data_loader import * 307 | uid = uuid.uuid4().get_hex()[:6] 308 | 309 | if __name__ == "__main__": 310 | # args = init_config() 311 | if args.mode == "train": 312 | if args.load_from_path is not None: 313 | args.load_from_path = args.load_from_path 314 | else: 315 | args.load_from_path = args.save_to_path 316 | main(args) 317 | 318 | elif args.mode == "test_1": 319 | test_single_model(args) 320 | 321 | else: 322 | raise NotImplementedError 323 | -------------------------------------------------------------------------------- /eval/conlleval.v2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). 
The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while () { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { 
$features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if ($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = $features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 
177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkEnd = $true; } 265 | 266 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 267 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 269 | 270 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 271 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 272 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 273 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkEnd = $true; } 274 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkEnd = $true; } 275 | 276 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkEnd = $true; } 277 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "S" and $tag eq "O" ) { $chunkEnd = $true; } 279 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkEnd = $true; } 280 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkEnd = $true; } 281 | 282 | 283 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 284 | $chunkEnd = $true; 285 | } 286 | 287 | # corrected 1998-12-22: these chunks are assumed to have length 1 288 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 289 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 290 | 291 | return($chunkEnd); 292 | } 293 | 294 | # startOfChunk: checks if a chunk started between the previous and current word 295 | # arguments: previous and current chunk tags, previous and current types 296 | # note: this code is capable of handling other chunk representations 297 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 298 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 299 | 300 | sub startOfChunk { 301 | my $prevTag = shift(@_); 302 | my $tag = shift(@_); 303 | my $prevType = shift(@_); 304 | my $type = shift(@_); 305 | my $chunkStart = $false; 306 | 307 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 308 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 309 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 310 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkStart = $true; } 311 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkStart = $true; } 312 | 313 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkStart = $true; } 314 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkStart = $true; } 315 | if ( $prevTag eq "O" and $tag eq "S" ) { $chunkStart = $true; } 316 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkStart = $true; } 317 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkStart = $true; } 318 | 319 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 320 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkStart = $true; } 321 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 322 | 323 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkStart = $true; } 324 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 325 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 326 | 327 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 328 | $chunkStart = $true; 329 | } 330 | 331 | # corrected 1998-12-22: these chunks are assumed to have length 1 332 | if ( $tag eq "[" ) { $chunkStart = $true; } 333 | if ( $tag eq "]" ) { $chunkStart = $true; } 334 | 335 | return($chunkStart); 336 | } 337 | -------------------------------------------------------------------------------- /models/decoders.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou and aditichaudhary' 2 | from utils.util import * 3 | import numpy as np 4 | from collections import defaultdict 5 | from scipy.special import logsumexp 6 | 7 | 8 | class Decoder(): 9 | def __init__(self, tag_size): 10 | # type: () -> object 11 | pass 12 | 13 | def decode_loss(self): 14 | raise NotImplementedError 15 | 16 | def decoding(self): 17 | raise NotImplementedError 18 | 19 | 20 | def constrained_transition_init(transition_matrix, contraints): 21 | ''' 22 | :param transition_matrix: numpy array, (from, to) 23 | :param contraints: [[from_indexes], [to_indexes]] 24 | :return: newly initialized transition matrix 25 | ''' 26 | for cons in contraints: 27 | transition_matrix[cons[0], cons[1]] = -1000.0 28 | return transition_matrix 29 | 30 | class chain_CRF_decoder(Decoder): 31 | ''' For NER and POS Tagging. 
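    Two extra tags are appended internally (start_id = tag_size, end_id = tag_size + 1)
    for the CRF start/end transitions, so self.tag_size is the label-set size plus two.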
''' 32 | 33 | def __init__(self, args, model, src_output_dim, tag_emb_dim, tag_size, constraints=None): 34 | Decoder.__init__(self, tag_size) 35 | self.model = model 36 | self.start_id = tag_size 37 | self.end_id = tag_size + 1 38 | self.tag_size = tag_size + 2 39 | tag_size = tag_size + 2 40 | self.args = args 41 | 42 | # optional: transform the hidden space of src encodings into the tag embedding space 43 | self.W_src2tag_readout = model.add_parameters((tag_emb_dim, src_output_dim)) 44 | self.b_src2tag_readout = model.add_parameters((tag_emb_dim)) 45 | self.b_src2tag_readout.zero() 46 | 47 | self.W_scores_readout2tag = model.add_parameters((tag_size, tag_emb_dim)) 48 | self.b_scores_readout2tag = model.add_parameters((tag_size)) 49 | self.b_scores_readout2tag.zero() 50 | 51 | # (to, from), trans[i] is the transition score to i 52 | init_transition_matrix = np.random.randn(tag_size, tag_size) # from, to 53 | init_transition_matrix[:, self.end_id] = -1000.0 54 | init_transition_matrix[self.start_id, :] = -1000.0 55 | if False and constraints is not None: 56 | init_transition_matrix = constrained_transition_init(init_transition_matrix, constraints) 57 | # print init_transition_matrix 58 | #self.transition_matrix = model.add_lookup_parameters((tag_size, tag_size), 59 | # init=dy.NumpyInitializer(init_transition_matrix)) 60 | self.transition_matrix = model.lookup_parameters_from_numpy(init_transition_matrix) # (to, from) 61 | 62 | self.ngram = args.ngram 63 | 64 | self.entropy_threshold = args.entropy_threshold 65 | if args.entropy_threshold is not None and args.use_CFB: 66 | self.entropy_threshold = args.entropy_threshold * -1 67 | 68 | self.prob_threshold = np.NINF 69 | self.entropy_spans = defaultdict(lambda: 0) 70 | self.most_uncertain_entropy_spans = {} 71 | self.entropy_spans_number = defaultdict(lambda: 0) 72 | self.full_sentences = defaultdict(list) 73 | self.avg_spans_in_sent_entropy = defaultdict(list) 74 | self.SPAN_wise = args.SPAN_wise 75 | 76 | def forward_alg(self, tag_scores): 77 | ''' Forward DP for CRF. 
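        Computes alpha_t(j) = logsumexp_i [ alpha_{t-1}(i) + transition(i -> j) ] + emission_t(j)
        and returns (terminal log-partition score per batch element, list of per-step alphas).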
78 | tag_scores (list of batched dy.Tensor): (tag_size, batchsize) 79 | ''' 80 | # Be aware: if a is lookup_parameter with 2 dimension, then a[i] returns one row; 81 | # if b = dy.parameter(a), then b[i] returns one column; which means dy.parameter(a) already transpose a 82 | # transpose_transition_score = self.transition_matrix 83 | transpose_transition_score = dy.parameter(self.transition_matrix) # (from, to) 84 | 85 | # alpha(t', s) = the score of sequence from t=0 to t=t' in log space 86 | # np_init_alphas = -100.0 * np.ones((self.tag_size, batch_size)) 87 | # np_init_alphas[self.start_id, :] = 0.0 88 | # alpha_tm1 = dy.inputTensor(np_init_alphas, batched=True) 89 | alphas = [] 90 | 91 | alpha_tm1 = transpose_transition_score[self.start_id] + tag_scores[0] 92 | # self.transition_matrix[i]: from i, column 93 | # transpose_score[i]: to i, row 94 | # transpose_score: to, from 95 | alphas.append(alpha_tm1) 96 | 97 | for tag_score in tag_scores[1:]: 98 | # extend for each transit 99 | alpha_tm1 = dy.concatenate_cols([alpha_tm1] * self.tag_size) # (from, to, batch_size) 100 | # each column i of tag_score will be the repeated emission score to tag i 101 | tag_score = dy.transpose(dy.concatenate_cols([tag_score] * self.tag_size)) 102 | alpha_t = alpha_tm1 + transpose_transition_score + tag_score 103 | alpha_tm1 = log_sum_exp_dim_0(alpha_t) # (tag_size, batch_size) 104 | alphas.append(alpha_tm1) 105 | 106 | terminal_alpha = log_sum_exp_dim_0(alpha_tm1 + self.transition_matrix[self.end_id]) # (1, batch_size) 107 | return terminal_alpha,alphas 108 | 109 | def score_one_sequence(self, tag_scores, tags, batch_size): 110 | ''' tags: list of tag ids at each time step ''' 111 | # print tags, batch_size 112 | # print batch_size 113 | # print "scoring one sentence" 114 | tags = [[self.start_id] * batch_size] + tags # len(tag_scores) = len(tags) - 1 115 | score = dy.inputTensor(np.zeros(batch_size), batched=True) 116 | # tag_scores = dy.concatenate_cols(tag_scores) # tot_tags, sent_len, batch_size 117 | # print "tag dim: ", tag_scores.dim() 118 | for i in range(len(tags) - 1): 119 | score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, tags[i + 1]), tags[i]) \ 120 | + dy.pick_batch(tag_scores[i], tags[i + 1]) 121 | score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, [self.end_id]*batch_size), tags[-1]) 122 | return score 123 | 124 | def backward_one_sequence(self, tag_scores): 125 | ''' Backward DP for CRF. 
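        Mirror image of forward_alg: accumulates beta scores right-to-left and returns
        (terminal beta, betas), with betas re-ordered to run left-to-right.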
126 | tag_scores (list of batched dy.Tensor): (tag_size, batchsize) 127 | ''' 128 | # Be aware: if a is lookup_parameter with 2 dimension, then a[i] returns one row; 129 | # if b = dy.parameter(a), then b[i] returns one column; which means dy.parameter(a) already transpose a 130 | transpose_transition_score = dy.parameter(self.transition_matrix) 131 | # transpose_transition_score = dy.parameter(self.transition_matrix) 132 | 133 | # alpha(t', s) = the score of sequence from t=0 to t=t' in log space 134 | # np_init_alphas = -100.0 * np.ones((self.tag_size, batch_size)) 135 | # np_init_alphas[self.start_id, :] = 0.0 136 | # alpha_tm1 = dy.inputTensor(np_init_alphas, batched=True) 137 | betas = [] 138 | # beta_tp1 = self.transition_matrix[self.end_id] + tag_scores[-1] 139 | # beta_tp1 = dy.inputTensor(np.zeros(self.tag_size)) 140 | beta_tp1 = self.transition_matrix[self.end_id] 141 | betas.append(beta_tp1) 142 | # self.transition_matrix[i]: from i, column 143 | # transpose_score[i]: to i, row 144 | # transpose_score: to, from 145 | seq_len = len(tag_scores) 146 | tag_scores.reverse() 147 | for tag_score in tag_scores[0:seq_len - 1]: 148 | # extend for each transit 149 | beta_tp1 = dy.concatenate_cols([beta_tp1] * self.tag_size) # (to, from, batch_size) 150 | # each column i of tag_score will be the repeated emission score to tag i 151 | tag_score = dy.concatenate_cols([tag_score] * self.tag_size) # (to, from) 152 | beta_t = beta_tp1 + dy.transpose(transpose_transition_score) + tag_score # (to, from) 153 | beta_tp1 = log_sum_exp_dim_0(beta_t) # (tag_size, batch_size) 154 | betas.append(beta_tp1) 155 | 156 | # betas.append(beta_tp1 + transpose_transition_score[self.start_id] + tag_scores[-1]) 157 | terminal_beta = log_sum_exp_dim_0( 158 | beta_tp1 + transpose_transition_score[self.start_id] + tag_scores[-1]) # (1, batch_size) 159 | betas.reverse() 160 | return terminal_beta, betas 161 | 162 | def get_uncertain_subsequences(self, sents, tag_scores, alphas, betas, Z, gammas, 163 | best_path, B_tags, I_tags, O_tags): 164 | # predicted_path = deepcopy(best_path) 165 | # transition_B_O = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, O_tags), B_tags).value()) 166 | # transition_I_O = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, O_tags), I_tags).value()) 167 | # transition_B_I = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, I_tags), B_tags).value()) 168 | # transition_I_I = np.array(dy.pick_batch(dy.lookup_batch(self.transition_matrix, I_tags), I_tags).value()) 169 | 170 | 171 | first = True 172 | Z = Z.value() 173 | for i in range(len(sents)): 174 | # log_p_alpha = np.array(alphas[i].value())[B_tags] 175 | # transition = transition_B_O 176 | log_pin = logsumexp(np.array(gammas[i].value())[B_tags]) #Prob (y=start_entity|x)= log_sum{tags}# (e^log(P=B-tag|x)) 177 | 178 | for j in range(i + 1, len(sents)): 179 | if (j - i) > self.ngram: 180 | break 181 | 182 | log_p_out = np.array(gammas[j].value())[O_tags] 183 | log_p = log_pin + log_p_out 184 | p = np.exp(log_p) 185 | if p > 1.0: 186 | #print(p, log_p) 187 | H= 0.0 188 | else: 189 | H = -(p * log_p) - ((1-p)* np.log(1-p)) 190 | 191 | if H > self.entropy_threshold: 192 | # best_path = deepcopy(predicted_path) 193 | span = " ".join([str(x) for x in sents[i:j]]) 194 | sent = " ".join([str(x) for x in sents]) 195 | self.entropy_spans[span] += H 196 | self.entropy_spans_number[span] += 1 197 | 198 | if self.SPAN_wise: 199 | if span in self.most_uncertain_entropy_spans: 200 | (existing_entropy, _,_,_,_) = 
self.most_uncertain_entropy_spans[span] 201 | #if H > existing_entropy: 202 | # self.most_uncertain_entropy_spans[span] = (H, sent, i,j,best_path) 203 | self.most_uncertain_entropy_spans[span] = (self.entropy_spans[span], sent, i, j, best_path) 204 | else: 205 | self.most_uncertain_entropy_spans[span] = (H, sent, i,j,best_path) 206 | 207 | # for k in range(i,j+1): 208 | # best_path[k] = -10 209 | self.full_sentences[sent].append((i, j, best_path, self.entropy_spans[span])) 210 | self.avg_spans_in_sent_entropy[sent].append(span) 211 | # self.full_sentences[span] = (sents,best_path,predicted_path, self.entropy_spans[span]) 212 | 213 | log_pin += logsumexp(np.array(gammas[j].value())[I_tags]) 214 | if log_pin < np.log(1e-4): 215 | break 216 | 217 | 218 | def get_uncertain_subsequences_CFB(self, sents, tag_scores, alphas, betas, Z, gammas, 219 | best_path, tag_to_id,B_UNK, I_UNK): 220 | first = True 221 | Z = Z.value() 222 | entropy_spans_number = defaultdict(lambda :0) 223 | for i in range(len(sents)): 224 | known_tags =np.array([[0]] * len(sents)).reshape((len(sents),1,1)) 225 | known_tags[i][0][0] = 1 226 | tags = np.array([B_UNK] * len(sents)).reshape((len(sents),1)) 227 | tags[i][0] = best_path[i] 228 | for j in range(i + 1, len(sents)): 229 | if (j - i + 1) > self.ngram: 230 | break 231 | tags[j][0] = best_path[j] 232 | known_tags[j][0][0] = 1 233 | Z_span = self.score_one_sequence_partial(tag_scores, tags, 1, known_tags, tag_to_id, B_UNK, I_UNK) 234 | confidence = Z_span.value() - Z 235 | 236 | if confidence < self.entropy_threshold: 237 | # best_path = deepcopy(predicted_path) 238 | span = " ".join([str(x) for x in sents[i:j + 1]]) 239 | sent = " ".join([str(x) for x in sents]) 240 | 241 | if self.SPAN_wise: 242 | if span in self.most_uncertain_entropy_spans: 243 | (existing_threshold, _,_,_,_) = self.most_uncertain_entropy_spans[span] 244 | if confidence < existing_threshold: 245 | self.most_uncertain_entropy_spans[span] = (confidence, sent, i, j,best_path) 246 | else: 247 | self.most_uncertain_entropy_spans[span] = (confidence, sent, i, j,best_path) 248 | 249 | self.entropy_spans[span] += confidence 250 | self.entropy_spans_number[span] += 1 251 | #self.full_sentences[sent].append((i, j, best_path, self.entropy_spans[span])) 252 | self.full_sentences[sent].append((i, j, best_path, span)) 253 | #self.avg_spans_in_sent_entropy[sent].append(self.entropy_spans[span]) 254 | self.avg_spans_in_sent_entropy[sent].append(span) 255 | 256 | 257 | def decode_loss(self, src_encodings, tgt_tags, use_partial, known_tags, tag_to_id, B_UNK, I_UNK): 258 | # This is the batched version which requires bucketed batch input with the same length. 259 | ''' 260 | The length of src_encodings and tgt_tags are time_steps. 
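        When use_partial is set, positions whose tag is UNK are not forced to a single gold
        label; score_one_sequence_partial marginalises over all labels allowed by the mask,
        giving a partial-CRF training objective.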
261 | src_encodings: list of dynet.Tensor (src_output_dim, batch_size) 262 | tgt_tags: list of tag ids [(1, batch_size)] 263 | return: average of negative log likelihood 264 | ''' 265 | # TODO: transpose tgt tags first 266 | batch_size = len(tgt_tags) 267 | tgt_tags, tgt_mask = transpose_input(tgt_tags, 0) 268 | known_tags, _ = transpose_input(known_tags, 0) 269 | 270 | W_src2tag_readout = dy.parameter(self.W_src2tag_readout) 271 | b_src2tag_readout = dy.parameter(self.b_src2tag_readout) 272 | W_score_tag = dy.parameter(self.W_scores_readout2tag) 273 | b_score_tag = dy.parameter(self.b_scores_readout2tag) 274 | 275 | tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) for src_encoding 276 | in src_encodings] 277 | 278 | tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs] 279 | 280 | # scores over all paths, all scores are in log-space 281 | forward_scores,_ = self.forward_alg(tag_scores) 282 | 283 | if use_partial: 284 | gold_score = self.score_one_sequence_partial(tag_scores, tgt_tags, batch_size, known_tags, tag_to_id, B_UNK, 285 | I_UNK) 286 | else: 287 | gold_score = self.score_one_sequence(tag_scores, tgt_tags, batch_size) 288 | 289 | # negative log likelihood 290 | loss = dy.sum_batches(forward_scores - gold_score) / batch_size 291 | return loss #, dy.sum_batches(forward_scores)/batch_size, dy.sum_batches(gold_score) / batch_size 292 | 293 | def makeMask(self, batch_size, known_tags, tag_to_id, tags, index, B_UNK, I_UNK): 294 | mask_w_0 = np.array([[-1000] * self.tag_size]) 295 | mask_w_0 = np.transpose(mask_w_0) 296 | mask_w_0_all_s = np.reshape(np.array([mask_w_0] * batch_size), (self.tag_size, batch_size)) 297 | 298 | mask_idx = [] 299 | tag_vals = [] 300 | for idx, w0_si in enumerate(known_tags[index]): 301 | if w0_si[0] == 1: 302 | mask_idx.append(idx) 303 | tag_vals.append(tags[index][idx]) 304 | else: 305 | if tags[index][idx] == B_UNK: 306 | if self.args.misc: 307 | possible_labels = ["B-LOC", "B-PER", "B-ORG", "B-MISC", "O","I-LOC", "I-PER", "I-ORG", "I-MISC"] 308 | else: 309 | possible_labels = ["B-LOC", "B-PER", "B-ORG", "B-GPE", "O","I-LOC", "I-PER", "I-ORG", "I-GPE"] 310 | for pl in possible_labels: 311 | mask_idx.append(idx) 312 | tag_vals.append(tag_to_id[pl]) 313 | mask_w_0_all_s[tag_vals, mask_idx] = 0 314 | return mask_w_0_all_s 315 | 316 | def score_one_sequence_partial(self, tag_scores, tags, batch_size, known_tags, tag_to_id, B_UNK, I_UNK): 317 | transpose_transition_score = dy.parameter(self.transition_matrix) 318 | 319 | alpha_tm1 = transpose_transition_score[self.start_id] + tag_scores[0] 320 | 321 | mask_w_0_all_s = self.makeMask(batch_size, known_tags, tag_to_id, tags, 0, B_UNK, I_UNK) 322 | i = 1 323 | alpha_tm1 = alpha_tm1 + dy.inputTensor(mask_w_0_all_s, batched=True) 324 | for tag_score in tag_scores[1:]: 325 | alpha_tm1 = dy.concatenate_cols([alpha_tm1] * self.tag_size) # (from, to, batch_size) 326 | tag_score = dy.transpose(dy.concatenate_cols([tag_score] * self.tag_size)) 327 | alpha_t = alpha_tm1 + transpose_transition_score + tag_score 328 | alpha_tm1 = log_sum_exp_dim_0(alpha_t) # (tag_size, batch_size) 329 | mask_w_i_all_s = self.makeMask(batch_size, known_tags, tag_to_id, tags, i, B_UNK, I_UNK) 330 | alpha_tm1 = alpha_tm1 + dy.inputTensor(mask_w_i_all_s, batched=True) 331 | i = i + 1 332 | 333 | terminal_alpha = log_sum_exp_dim_0(alpha_tm1 + self.transition_matrix[self.end_id]) # (1, batch_size) 334 | return terminal_alpha 335 | 336 | 337 | def 
    def get_crf_scores(self, src_encodings):
        W_src2tag_readout = dy.parameter(self.W_src2tag_readout)
        b_src2tag_readout = dy.parameter(self.b_src2tag_readout)
        W_score_tag = dy.parameter(self.W_scores_readout2tag)
        b_score_tag = dy.parameter(self.b_scores_readout2tag)

        tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding]))
                    for src_encoding in src_encodings]
        tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs]

        transpose_transition_score = dy.parameter(self.transition_matrix)  # (from, to)

        return transpose_transition_score.npvalue(), [ts.npvalue() for ts in tag_scores]

    def decoding(self, src_encodings, OTag, addbias=False):
        ''' Viterbi decoding for a single sequence. '''
        W_src2tag_readout = dy.parameter(self.W_src2tag_readout)
        b_src2tag_readout = dy.parameter(self.b_src2tag_readout)
        W_score_tag = dy.parameter(self.W_scores_readout2tag)
        b_score_tag = dy.parameter(self.b_scores_readout2tag)

        tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding]))
                    for src_encoding in src_encodings]
        if addbias:
            # Replace the learned emission bias with a small constant bias towards the O tag.
            b_score_tag = np.zeros(self.tag_size)
            b_score_tag[OTag] = 0.5
            b_score_tag = dy.inputTensor(b_score_tag)

        tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs]

        back_trace_tags = []
        np_init_alpha = np.ones(self.tag_size) * -2000.0
        np_init_alpha[self.start_id] = 0.0
        max_tm1 = dy.inputTensor(np_init_alpha)
        transpose_transition_score = dy.parameter(self.transition_matrix)  # (from, to)

        for i, tag_score in enumerate(tag_scores):
            max_tm1 = dy.concatenate_cols([max_tm1] * self.tag_size)
            max_t = max_tm1 + transpose_transition_score
            if i != 0:
                eval_score = max_t.npvalue()[:-2, :]
            else:
                eval_score = max_t.npvalue()
            best_tag = np.argmax(eval_score, axis=0)
            back_trace_tags.append(best_tag)
            max_tm1 = dy.inputTensor(eval_score[best_tag, range(self.tag_size)]) + tag_score

        terminal_max_T = max_tm1 + self.transition_matrix[self.end_id]
        eval_terminal = terminal_max_T.npvalue()[:-2]
        best_tag = np.argmax(eval_terminal, axis=0)
        best_path_score = eval_terminal[best_tag]

        best_path = [best_tag]
        for btpoint in reversed(back_trace_tags):
            best_tag = btpoint[best_tag]
            best_path.append(best_tag)
        start = best_path.pop()
        assert start == self.start_id
        best_path.reverse()
        return best_path_score, best_path, tag_scores

    def cal_accuracy(self, pred_path, true_path):
        return np.sum(np.equal(pred_path, true_path).astype(np.float32)) / len(pred_path)

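# Usage sketch for the ensemble decoder defined below (hypothetical caller,
# not part of this file): each trained model contributes the numpy scores
# returned by get_crf_scores above, and `tag_size` is the number of real tags
# (start/end tags are appended inside ensemble_viterbi_decoding).
#
#     outputs = [decoder.get_crf_scores(src_encodings) for decoder in decoders]
#     l_transit_score = [transit for transit, _ in outputs]
#     l_tag_scores = [scores for _, scores in outputs]
#     best_score, best_path = ensemble_viterbi_decoding(l_tag_scores, l_transit_score, tag_size)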
def ensemble_viterbi_decoding(l_tag_scores, l_transit_score, tag_size):
    back_trace_tags = []
    tag_size = tag_size + 2
    start_id = tag_size - 2
    end_id = tag_size - 1
    max_tm1 = np.ones(tag_size) * -2000.0
    max_tm1[start_id] = 0.0

    # Average the emission scores and transition scores across ensemble members.
    tag_scores = []
    for i in range(len(l_tag_scores[0])):
        tag_scores.append(sum([ts[i] for ts in l_tag_scores]) / len(l_tag_scores))
    transpose_transition_score = sum(l_transit_score) / len(l_transit_score)  # (from, to)

    for i, tag_score in enumerate(tag_scores):
        max_tm1 = np.tile(np.expand_dims(max_tm1, axis=1), (1, tag_size))
        max_t = max_tm1 + transpose_transition_score
        if i != 0:
            eval_score = max_t[:-2, :]
        else:
            eval_score = max_t
        best_tag = np.argmax(eval_score, axis=0)
        back_trace_tags.append(best_tag)
        max_tm1 = eval_score[best_tag, range(tag_size)] + tag_score

    terminal_max_T = max_tm1 + transpose_transition_score[:, end_id]
    eval_terminal = terminal_max_T[:-2]
    best_tag = np.argmax(eval_terminal, axis=0)
    best_path_score = eval_terminal[best_tag]

    best_path = [best_tag]
    for btpoint in reversed(back_trace_tags):
        best_tag = btpoint[best_tag]
        best_path.append(best_tag)
    start = best_path.pop()
    assert start == start_id
    best_path.reverse()
    return best_path_score, best_path


# Simple per-token softmax decoder (no CRF transitions).
class classifier(Decoder):
    def __init__(self, model, input_dim, tag_size):
        self.W_softmax = model.add_parameters((tag_size, input_dim))
        self.b_softmax = model.add_parameters((tag_size,))

    def decode_loss(self, src_encoding, tgt_tags):
        batch_size = len(tgt_tags)
        tgt_tags, tgt_mask = transpose_input(tgt_tags, 0)

        assert len(src_encoding) == len(tgt_tags)

        W_softmax = dy.parameter(self.W_softmax)
        b_softmax = dy.parameter(self.b_softmax)

        predictions = [dy.affine_transform([b_softmax, W_softmax, src_emb]) for src_emb in src_encoding]

        losses = [dy.pickneglogsoftmax_batch(pred, tgt) for pred, tgt in zip(predictions, tgt_tags)]

        loss = dy.sum_batches(dy.esum(losses)) / (batch_size * len(src_encoding))

        return loss

    def decoding(self, src_encoding):
        W_softmax = dy.parameter(self.W_softmax)
        b_softmax = dy.parameter(self.b_softmax)
        predictions = [dy.affine_transform([b_softmax, W_softmax, src_emb]) for src_emb in src_encoding]

        predictions = [np.argmax(pred.npvalue()) for pred in predictions]

        return None, predictions

--------------------------------------------------------------------------------