├── QANet
├── README.md
├── answers
│   ├── ver20
│   │   └── ans_files
│   ├── ver60
│   │   └── ans_files
│   └── ver646
│   │   └── ans_files
├── config_ans_v20.py
├── config_cla_v60.py
├── config_cla_v646.py
├── data
│   ├── pickles
│   │   └── log
│   └── prepro
│   │   └── log
├── datasets
│   ├── aic18
│   │   ├── test_mini.json
│   │   ├── train_mini.json
│   │   └── valid_mini.json
│   └── word2vec
│   │   └── log
├── layers.py
├── main_debug_ans_v20.py
├── main_debug_cla_v60.py
├── main_debug_cla_ver646.py
├── model_ans_v20.py
├── model_cla_v60.py
├── model_cla_ver646.py
├── models
│   ├── ver20
│   │   ├── event
│   │   │   └── log
│   │   └── model
│   │   │   └── log
│   ├── ver60
│   │   ├── event
│   │   │   └── log
│   │   └── model
│   │   │   └── log
│   └── ver646
│   │   ├── event
│   │   └── log
│   │   └── model
│   │   └── log
├── prepro.py
├── preprocess.py
├── run_preprocess.py
├── util.py
└── vote.py
├── README.md
├── capsuleNet
├── README.md
├── dataset.py
├── layers
│   ├── __init__.py
│   ├── basic_rnn.py
│   ├── caps_layer.py
│   └── match_layer.py
├── model.png
├── post_process.py
├── rc_model81.py
├── rc_model84.py
├── run81.py
├── run84.py
└── vocab.py
├── data
├── devset
│   └── dev_mini.json
├── trainset
│   └── train_mini.json
├── v81
│   ├── logging2
│   └── testset
│   │   └── test_mini.json
├── v84
│   ├── logging2
│   └── testset
│   │   └── test_mini.json
└── w2v
│   └── log
├── start.sh
└── vote_ser_new_word.py

/QANet/README.md:
--------------------------------------------------------------------------------
1 | ## Model-1 QANet
2 | This model is based on QANet, but it has 2 submodels. In this folder you can see 3 versions of the model: ver20 belongs to submodel-1, while ver60 and ver646 belong to submodel-2. I use these 3 versions to ensemble the model. The details of these 3 versions are:
3 | 
4 | Version | Kernel Size Char | Kernel Size Conv | Hidden Size
5 | ---|---|---|---
6 | ver20 | 2 | 7 | 96
7 | ver60 | 2 | 7 | 96
8 | ver646 | 1 | 4 | 64
9 | 
10 | Kernel Size Char: the kernel size used in the char embedding layer.
11 | 
12 | Kernel Size Conv: the kernel size used in the model encoder layer.
13 | 
14 | All of them use the params below:
15 | * w2v: jwe_size300.txt. This is a word embedding trained by splitting Chinese characters into their components; for example, we split “好” into "女" and "子". My partner Zhang did this excellent job. You can download it from [HERE](https://pan.baidu.com/s/1eKa7F-OBGQgLSsOaTtJDxg); the password is "qt16".
16 | * context length: 100, the max length for the context; I want to keep the number of context words below 100.
17 | * query length: 30, the max length for the query; I want to keep the number of query words below 30.
18 | 
19 | If you want to understand my code clearly, you should learn how QANet works first.
20 | And I have written a blog for you :D
21 | 
22 | [彻底弄懂QANet](https://antdlx.com/qanet/)
23 | 
24 | ## SubModel-1
25 | ![submodel1](http://cdn.antdlx.com/qa20.png)
26 | 1. I add an alternatives embedding layer
27 | 2. I change the model encoder layer's encoder block num from 3 to 2
28 | 3. I change the output layer
29 | 
30 | ## SubModel-2
31 | ![submodel2](http://cdn.antdlx.com/qa60.png)
32 | 1. I change the model encoder layer's encoder block num from 3 to 2
33 | 2. I change the output layer
34 | 
35 | ## Usage
36 | Run config_xx_vxx.py. You need to set at least 2 params:
37 | * --mode: test/valid/train/debug
38 | * --input: test/valid/train file path; note that debug mode uses the train path
39 | 
40 | Be careful: the results in the answers folder are tmp files. You can run "vote_ser_new_word.py" to get the final answers, or edit the code yourself.
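For instance, a test run of submodel-1 (ver20) on the bundled mini test set could look like the sketch below. Treat it as an illustration of the two flags rather than a verified command: the exact invocation depends on your working directory and PYTHONPATH, because the configs import the code as the `QANet` package while resolving the data and answer paths relative to the QANet folder.

```bash
# Illustration only: run ver20 in test mode on the bundled mini test set.
# Assumes the QANet package is importable (e.g. PYTHONPATH includes the repo root)
# and that ./datasets and ./answers resolve from your working directory.
python config_ans_v20.py --mode test --input ./datasets/aic18/test_mini.json

# Validation mode reads the preprocessed dev set, so --input is not needed:
python config_ans_v20.py --mode valid

# The per-version tmp answers land in ./answers/ver20; merge the three versions
# with vote_ser_new_word.py (repo root) as described above to get the final answers.
```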
And thanks to my partner Liu, who wrote that script.
41 | 
--------------------------------------------------------------------------------
/QANet/answers/ver20/ans_files:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/antdlx/aic18_rc/1e243f0b385e4d7645658020ecb202866b9829aa/QANet/answers/ver20/ans_files
--------------------------------------------------------------------------------
/QANet/answers/ver60/ans_files:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/antdlx/aic18_rc/1e243f0b385e4d7645658020ecb202866b9829aa/QANet/answers/ver60/ans_files
--------------------------------------------------------------------------------
/QANet/answers/ver646/ans_files:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/antdlx/aic18_rc/1e243f0b385e4d7645658020ecb202866b9829aa/QANet/answers/ver646/ans_files
--------------------------------------------------------------------------------
/QANet/config_ans_v20.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import tensorflow as tf
4 | 
5 | from QANet.main_debug_ans_v20 import valid, test, train
6 | from QANet.prepro import prepro
7 | 
8 | '''
9 | This file is taken and modified from R-Net by HKUST-KnowComp
10 | https://github.com/HKUST-KnowComp/R-Net
11 | '''
12 | 
13 | flags = tf.flags
14 | parser = argparse.ArgumentParser('Reading Comprehension on aic dataset')
15 | parser.add_argument('--mode', default="test",
16 |                     help="Running mode test/valid/train/debug")
17 | parser.add_argument('--input', default="./datasets/aic18/test_mini.json",
18 |                     help='data input path')
19 | 
20 | answer_dir = "./answers/ver20"
21 | model_dir = "./models/ver20"
22 | 
23 | answer_file = os.path.join(answer_dir, "tmp_te_v20.txt")
24 | sorted_answer_file = os.path.join(answer_dir, "sorted_testa_v20.txt")
25 | valid_file = os.path.join(answer_dir, "tmp_va_v20.txt")
26 | sorted_valid_file = os.path.join(answer_dir, "sorted_valid_v20.txt")
27 | 
28 | train_dir = "models"
29 | model_name = "ver20"
30 | dir_name = os.path.join(train_dir, model_name)
31 | if not os.path.exists(train_dir):
32 |     os.mkdir(train_dir)
33 | if not os.path.exists(os.path.join(os.getcwd(),dir_name)):
34 |     os.mkdir(os.path.join(os.getcwd(),dir_name))
35 | dir_name = os.path.join(train_dir, model_name)
36 | log_dir = os.path.join(dir_name, "event")
37 | save_dir = os.path.join(dir_name, "model")
38 | 
39 | if not os.path.exists(log_dir):
40 |     os.makedirs(log_dir)
41 | if not os.path.exists(answer_dir):
42 |     os.makedirs(answer_dir)
43 | flags.DEFINE_string("log_dir", log_dir, "Directory for tf event")
44 | flags.DEFINE_string("save_dir", save_dir, "Directory for saving model")
45 | flags.DEFINE_string("answer_file", answer_file, "Out file for answer")
46 | flags.DEFINE_string("sorted_answer_file", sorted_answer_file, "Out file for answer")
47 | flags.DEFINE_string("valid_file", valid_file, "Out file for answer")
48 | flags.DEFINE_string("sorted_valid_file", sorted_valid_file, "Out file for answer")
49 | flags.DEFINE_string("model_dir", model_dir, "Directory for saving model")
50 | flags.DEFINE_integer("glove_dim",300, "Embedding dimension for Glove")
51 | flags.DEFINE_integer("char_dim", 300, "Embedding dimension for char")
52 | 
53 | flags.DEFINE_integer("para_limit",100, "Limit length for paragraph")
54 | flags.DEFINE_integer("ques_limit", 30, "Limit
length for question") 55 | flags.DEFINE_integer("ans_limit", 3, "Limit length for answers") 56 | flags.DEFINE_integer("test_para_limit", 1000, "Limit length for paragraph in test file") 57 | flags.DEFINE_integer("test_ques_limit", 100, "Limit length for question in test file") 58 | flags.DEFINE_integer("char_limit", 4, "Limit length for character") 59 | 60 | flags.DEFINE_integer("capacity", 15000, "Batch size of dataset shuffle") 61 | flags.DEFINE_integer("num_threads", 4, "Number of threads in input pipeline") 62 | flags.DEFINE_boolean("is_bucket", False, "build bucket batch iterator or not") 63 | flags.DEFINE_list("bucket_range", [40, 401, 40], "the range of bucket") 64 | 65 | flags.DEFINE_integer("batch_size", 128, "128Batch size") 66 | flags.DEFINE_integer("epoch", 25, "epoch num") 67 | flags.DEFINE_integer("period", 50, "100period to save batch loss") 68 | flags.DEFINE_integer("val_num_batches", 150, "Number of batches to evaluate the model") 69 | flags.DEFINE_float("dropout", 0.1, "Dropout prob across the layers") 70 | flags.DEFINE_float("grad_clip", 5.0, "Global Norm gradient clipping rate") 71 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate") 72 | flags.DEFINE_float("decay", 0.9999, "Exponential moving average decay") 73 | flags.DEFINE_float("l2_norm", 3e-7, "L2 norm scale") 74 | flags.DEFINE_integer("hidden", 96, "Hidden size") 75 | flags.DEFINE_integer("num_heads", 1, "Number of heads in self attention") 76 | flags.DEFINE_integer("early_stop", 5, "Checkpoints for early stop") 77 | 78 | 79 | 80 | def main(_): 81 | config = flags.FLAGS 82 | args = parser.parse_args() 83 | if args.mode == "valid": 84 | word_mat,counter,_ = prepro(20) 85 | valid(config,word_mat,counter) 86 | elif args.mode == "test": 87 | word_mat,counter,_ = prepro(20,type=1,input=args.input) 88 | test(config,word_mat,counter,args.input) 89 | elif args.mode == "train": 90 | word_mat, train_counter,w2id = prepro(20, type=2) 91 | _,dev_counter,_ = prepro(20,type=0,embedding_table=w2id) 92 | train(config,word_mat,train_counter,dev_counter) 93 | elif args.mode == "debug": 94 | word_mat, train_counter,w2id = prepro(20, type=2) 95 | _,dev_counter,_ = prepro(20,type=0,embedding_table=w2id) 96 | config.batch_size = 2 97 | config.epoch = 3 98 | train(config,word_mat,train_counter,dev_counter) 99 | else: 100 | print("Unknown mode") 101 | exit(0) 102 | 103 | 104 | if __name__ == "__main__": 105 | tf.app.run() 106 | -------------------------------------------------------------------------------- /QANet/config_cla_v60.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import tensorflow as tf 4 | 5 | from QANet.prepro import prepro 6 | 7 | ''' 8 | This file is taken and modified from R-Net by HKUST-KnowComp 9 | https://github.com/HKUST-KnowComp/R-Net 10 | ''' 11 | 12 | from QANet.main_debug_cla_v60 import valid, test, train 13 | 14 | flags = tf.flags 15 | parser = argparse.ArgumentParser('Reading Comprehension on aic dataset') 16 | parser.add_argument('--mode', default="test", 17 | help="Running mode test/valid/train/debug") 18 | parser.add_argument('--input', default="./datasets/aic18/test_mini.json", 19 | help='data input path') 20 | 21 | answer_dir = "./answers/ver60" 22 | model_dir = "./models/ver60" 23 | 24 | train_dir = "models" 25 | model_name = "ver60" 26 | answer_file = os.path.join(answer_dir, "testa_v60.txt") 27 | valid_file = os.path.join(answer_dir, "valid_v60.txt") 28 | dir_name = os.path.join(train_dir, model_name) 29 | log_dir = 
os.path.join(dir_name, "event") 30 | save_dir = os.path.join(dir_name, "model") 31 | 32 | if not os.path.exists(answer_dir): 33 | os.makedirs(answer_dir) 34 | if not os.path.exists(answer_dir): 35 | os.makedirs(answer_dir) 36 | 37 | flags.DEFINE_string("log_dir", log_dir, "Directory for tf event") 38 | flags.DEFINE_string("save_dir", save_dir, "Directory for saving model") 39 | 40 | flags.DEFINE_string("mode", "train", "Running mode test/valid") 41 | flags.DEFINE_string("input", "./datasets/aic18/test.json", "Running mode test/valid") 42 | flags.DEFINE_string("model_dir", model_dir, "Directory for saving model") 43 | flags.DEFINE_string("valid_file", valid_file, "Out file for answer") 44 | flags.DEFINE_string("answer_file", answer_file, "Out file for answer") 45 | 46 | flags.DEFINE_integer("glove_char_size", 94, "Corpus size for Glove") 47 | flags.DEFINE_integer("glove_word_size", int(2.2e6), "Corpus size for Glove") 48 | flags.DEFINE_integer("glove_dim", 300, "Embedding dimension for Glove") 49 | flags.DEFINE_integer("char_dim", 300, "Embedding dimension for char") 50 | 51 | flags.DEFINE_integer("para_limit",100, "200Limit length for paragraph") 52 | flags.DEFINE_integer("ques_limit", 30, "Limit length for question") 53 | flags.DEFINE_integer("ans_limit", 3, "Limit length for answers") 54 | flags.DEFINE_integer("test_para_limit", 1000, "Limit length for paragraph in test file") 55 | flags.DEFINE_integer("test_ques_limit", 100, "Limit length for question in test file") 56 | flags.DEFINE_integer("char_limit", 4, "Limit length for character") 57 | flags.DEFINE_integer("word_count_limit", -1, "Min count for word") 58 | flags.DEFINE_integer("char_count_limit", -1, "Min count for char") 59 | 60 | flags.DEFINE_integer("capacity", 15000, "Batch size of dataset shuffle") 61 | flags.DEFINE_integer("num_threads", 4, "Number of threads in input pipeline") 62 | 63 | flags.DEFINE_integer("batch_size", 256, "128Batch size") 64 | flags.DEFINE_integer("epoch", 25, "epoch size") 65 | flags.DEFINE_integer("period", 50, "100period to save batch loss") 66 | flags.DEFINE_integer("val_num_batches", 150, "Number of batches to evaluate the model") 67 | flags.DEFINE_float("dropout", 0.1, "Dropout prob across the layers") 68 | flags.DEFINE_float("grad_clip", 5.0, "Global Norm gradient clipping rate") 69 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate") 70 | flags.DEFINE_float("learning_decay_rate", 0.9, "Learning rate") 71 | flags.DEFINE_float("end_learning_rate", 0.0001, "Learning rate") 72 | flags.DEFINE_float("decay", 0.9999, "Exponential moving average decay") 73 | flags.DEFINE_float("l2_norm", 3e-7, "L2 norm scale") 74 | flags.DEFINE_integer("hidden", 96, "Hidden size") 75 | flags.DEFINE_integer("num_heads", 1, "Number of heads in self attention") 76 | flags.DEFINE_integer("early_stop", 8, "Checkpoints for early stop") 77 | 78 | def main(_): 79 | config = flags.FLAGS 80 | args = parser.parse_args() 81 | if args.mode == "valid": 82 | word_mat,counter,_ = prepro(60) 83 | valid(config,word_mat,counter) 84 | elif args.mode == "test": 85 | word_mat,counter,_ = prepro(60,type=1,input=args.input) 86 | test(config,word_mat,counter) 87 | elif args.mode == "train": 88 | word_mat, train_counter, w2id = prepro(60, type=2) 89 | _, dev_counter, _ = prepro(60, type=0, embedding_table=w2id) 90 | train(config, word_mat, train_counter, dev_counter) 91 | elif args.mode == "debug": 92 | word_mat, train_counter, w2id = prepro(60, type=2) 93 | _, dev_counter, _ = prepro(60, type=0, embedding_table=w2id) 94 | 
config.batch_size = 2 95 | config.epoch = 3 96 | train(config, word_mat, train_counter, dev_counter) 97 | else: 98 | print("Unknown mode") 99 | exit(0) 100 | 101 | 102 | 103 | if __name__ == "__main__": 104 | tf.app.run() 105 | -------------------------------------------------------------------------------- /QANet/config_cla_v646.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import tensorflow as tf 4 | 5 | from QANet.prepro import prepro 6 | 7 | ''' 8 | This file is taken and modified from R-Net by HKUST-KnowComp 9 | https://github.com/HKUST-KnowComp/R-Net 10 | ''' 11 | from QANet.main_debug_cla_ver646 import valid, test,train 12 | flags = tf.flags 13 | parser = argparse.ArgumentParser('Reading Comprehension on aic dataset') 14 | parser.add_argument('--mode', default="test", 15 | help="Running mode test/valid/train/debug") 16 | parser.add_argument('--input', default="./datasets/aic18/test_mini.json", 17 | help='data input path') 18 | 19 | answer_dir = "./answers/ver646" 20 | model_dir = "./models/ver646" 21 | 22 | train_dir = "models" 23 | model_name = "ver646" 24 | answer_file = os.path.join(answer_dir, "testa_v646.txt") 25 | valid_file = os.path.join(answer_dir, "valid_v646.txt") 26 | dir_name = os.path.join(train_dir, model_name) 27 | log_dir = os.path.join(dir_name, "event") 28 | save_dir = os.path.join(dir_name, "model") 29 | 30 | if not os.path.exists(answer_dir): 31 | os.makedirs(answer_dir) 32 | if not os.path.exists(answer_dir): 33 | os.makedirs(answer_dir) 34 | 35 | flags.DEFINE_string("log_dir", log_dir, "Directory for tf event") 36 | flags.DEFINE_string("save_dir", save_dir, "Directory for saving model") 37 | 38 | flags.DEFINE_string("mode", "train", "Running mode test/valid") 39 | flags.DEFINE_string("input", "./datasets/aic18/test.json", "Running mode test/valid") 40 | flags.DEFINE_string("model_dir", model_dir, "Directory for saving model") 41 | flags.DEFINE_string("valid_file", valid_file, "Out file for answer") 42 | flags.DEFINE_string("answer_file", answer_file, "Out file for answer") 43 | 44 | flags.DEFINE_integer("glove_char_size", 94, "Corpus size for Glove") 45 | flags.DEFINE_integer("glove_word_size", int(2.2e6), "Corpus size for Glove") 46 | flags.DEFINE_integer("glove_dim", 300, "Embedding dimension for Glove") 47 | flags.DEFINE_integer("char_dim", 300, "Embedding dimension for char") 48 | 49 | flags.DEFINE_integer("para_limit",100, "200Limit length for paragraph") 50 | flags.DEFINE_integer("ques_limit", 30, "Limit length for question") 51 | flags.DEFINE_integer("ans_limit", 3, "Limit length for answers") 52 | flags.DEFINE_integer("test_para_limit", 1000, "Limit length for paragraph in test file") 53 | flags.DEFINE_integer("test_ques_limit", 100, "Limit length for question in test file") 54 | flags.DEFINE_integer("char_limit", 4, "Limit length for character") 55 | flags.DEFINE_integer("word_count_limit", -1, "Min count for word") 56 | flags.DEFINE_integer("char_count_limit", -1, "Min count for char") 57 | 58 | flags.DEFINE_integer("capacity", 15000, "Batch size of dataset shuffle") 59 | flags.DEFINE_integer("num_threads", 4, "Number of threads in input pipeline") 60 | flags.DEFINE_boolean("is_bucket", False, "build bucket batch iterator or not") 61 | 62 | flags.DEFINE_integer("batch_size", 256, "128Batch size") 63 | flags.DEFINE_integer("epoch", 25, "epoch size") 64 | flags.DEFINE_integer("period", 50, "100period to save batch loss") 65 | flags.DEFINE_integer("val_num_batches", 150, 
"Number of batches to evaluate the model") 66 | flags.DEFINE_float("dropout", 0.1, "Dropout prob across the layers") 67 | flags.DEFINE_float("grad_clip", 5.0, "Global Norm gradient clipping rate") 68 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate") 69 | flags.DEFINE_float("learning_decay_rate", 0.9, "Learning rate") 70 | flags.DEFINE_float("end_learning_rate", 0.0001, "Learning rate") 71 | flags.DEFINE_float("decay", 0.9999, "Exponential moving average decay") 72 | flags.DEFINE_float("l2_norm", 3e-7, "L2 norm scale") 73 | flags.DEFINE_integer("hidden", 64, "Hidden size") 74 | flags.DEFINE_integer("num_heads", 1, "Number of heads in self attention") 75 | flags.DEFINE_integer("early_stop", 10, "Checkpoints for early stop") 76 | 77 | def main(_): 78 | config = flags.FLAGS 79 | args = parser.parse_args() 80 | if args.mode == "valid": 81 | word_mat,counter,_ = prepro(646) 82 | valid(config,word_mat,counter) 83 | elif args.mode == "test": 84 | word_mat,counter,_ = prepro(646,type=1,input=args.input) 85 | test(config,word_mat,counter) 86 | elif args.mode == "train": 87 | word_mat, train_counter, w2id = prepro(646, type=2) 88 | _, dev_counter, _ = prepro(646, type=0, embedding_table=w2id) 89 | train(config, word_mat, train_counter, dev_counter) 90 | elif args.mode == "debug": 91 | word_mat, train_counter, w2id = prepro(646, type=2) 92 | _, dev_counter, _ = prepro(646, type=0, embedding_table=w2id) 93 | config.batch_size = 2 94 | config.epoch = 3 95 | train(config, word_mat, train_counter, dev_counter) 96 | else: 97 | print("Unknown mode") 98 | exit(0) 99 | 100 | 101 | if __name__ == "__main__": 102 | tf.app.run() 103 | -------------------------------------------------------------------------------- /QANet/data/pickles/log: -------------------------------------------------------------------------------- 1 | here would generated some tmp pickle files -------------------------------------------------------------------------------- /QANet/data/prepro/log: -------------------------------------------------------------------------------- 1 | here would generated some tmp tfrecords for model -------------------------------------------------------------------------------- /QANet/datasets/aic18/test_mini.json: -------------------------------------------------------------------------------- 1 | {"url": "http://gsrb.gansudaily.com.cn/system/2009/08/23/011235562.shtml", "query": "武威的公交卡古浪能不能用", "query_id": 280001, "alternatives": "能|不能|无法确定", "passage": "武威公交一体化纪实 10家运输公司中标经营包括凉州区、古浪、民勤、天祝在内的城乡公交线路。经过收编、整合、更新,开通城乡公交客运班线23条,统一投放80辆高档次客运车辆,由运输公司统一管理。实际上,运营在这些线路的新型双开门公交车的标准、设施已远远超过城区公交车。武威运管部门通过市场竞争和行业引导,建立退出机制,规范经营行为,提升服务质量。   去年11月下旬,武威市区至古浪县城和凉州区50公里范围内的乡镇全部开通城乡公交,凉州区28个乡镇300个行政村更是全部覆盖城乡公交,率先实现“乡乡通公交,村村通客车”。这些城乡公交定时、定班、定点、定线,城乡公交均等化延伸到农民的家门口。“乡村小公交起到了穿针引线、走村串巷的功能。”沈兴国说。"} 2 | {"url": "http://wenwen.sogou.com/z/q701006723.htm", "query": "能买到无线偷拍器吗", "query_id": 280002, "alternatives": "能|不能|无法确定", "passage": "现在这个社会什么买不到,只要你有钱是不是 欢迎光临【深圳平安安防】无线的有线的都有呢,看你喜欢什么样的了,在这里就不多介绍了,也不好介绍有需要的话你可以进去看一看"} 3 | {"url": "http://wenwen.sogou.com/z/q763575352.htm", "query": "中安信业减免还款是真实的吗", "query_id": 280003, "alternatives": "是真的|不是真的|无法确定", "passage": "请问朋友们网上中安信业代款是真的吗? 
【百度反诈骗联盟团队】特别提醒:网上发布的所有只凭身份证就可以贷款或者信用卡的信息都是低级骗局,无论公司是否注册备案,都不要相信,骗子先骗你签订传真合同,并按捺手印,然后会一步步骗取你先支付首月利息、履约费、保证金、保险费、担保费、放款费、公证费、征信费、抵押金、开卡费等等,还会以你银行流水不足、查验你的还款能力或者是验资为名,要求你将自己账户上所有的资金打至骗子的账户,如果你不按骗子的要求交纳费用,骗子会以你已经和他们签了合同为名,威胁要起诉你违约,并威胁你赔偿巨额违约金,这实为低级的诈骗手段和典型诈骗!请永远记住,凡是对方以任何理由要求你先支付任何费用的,都是绝对的诈骗,无论在任何情况下,都不要先给其他人汇款、转账,以免被骗!更不要相信骗子的任何威胁,由于对方涉嫌诈骗,所以,和骗子签的合约没有任何法律效力,更不存在违约之说。所以,特此提醒广大网友,不要相信网上各种投资、融资担保公司以及各类小额贷公司发布的此类贷款或者卡信息,特别是北京、上海、广州、深圳等大城市的这类公司基本都是骗子公司!如果被骗,无论金额大小,都请选择报警!如此猖狂诈骗,还请各地公安机关大力打击和整顿! "} 4 | {"url": "http://www.mama.cn/ask/q13547252-p1.html", "query": "petct医保报销吗", "query_id": 280004, "alternatives": "能|不能|无法确定", "passage": "对于这些的话也可以咨询一下你的直属上司或者是领导,他们专业的意见也都是可以的。"} 5 | {"url": "http://www.d1xz.net/astro/Cancer/art117849.aspx", "query": "巨蟹座慢热么", "query_id": 280005, "alternatives": "慢热|不慢热|无法确定", "passage": "在巨蟹座当中,慢热型的性格,更是让她们的爱心与细腻,更好的发挥到极致。"} 6 | {"url": "http://www.169kang.com/question/369685826.html", "query": "菊花茶叶能一起泡吗", "query_id": 280006, "alternatives": "能|不能|无法确定", "passage": "菊花有清热解毒、清肝明目的作用,茶叶尤其绿茶同样具有清热解毒的作用,两者一起泡茶无碍。"} 7 | {"url": "http://www.169kang.com/question/409628430.html", "query": "嗓子疼吃感康行吗", "query_id": 280007, "alternatives": "行|不行|无法确定", "passage": "引起咽喉疼痛不适的原因多是由于扁桃体炎或是急性咽炎所导致,感康片主要用于感冒不适有一定的作用,如发热,头痛,鼻塞。"} 8 | {"url": "http://www.ali213.net/news/html/2014-7/109714.html", "query": "漫威电影美队换人了吗", "query_id": 280008, "alternatives": "换了|没换|无法确定", "passage": "漫威近日宣布,“猎鹰”山姆·威尔森将代替史蒂夫·罗杰斯,成为10月新系列漫画的新任美国队长!"} 9 | {"url": "http://www.abcb.net.cn/ximan-wulumuqijiunianyiwujiaoyu.html", "query": "新疆是九年义务教育还是十二年义务教育", "query_id": 280009, "alternatives": "九年义务教育|十二年义务教育|无法确定", "passage": ".我国现在实行的还是9年义务教育..至少目前没有准确消息说要实行12点义务教育..不实行新疆不属于这次的试点地区不."} 10 | {"url": "http://cq.bendibao.com/traffic/2018412/72718.shtm", "query": "重庆星期六限号吗", "query_id": 280010, "alternatives": "限|不限|无法确定", "passage": "星期六、星期日因法定节假日调休变为工作日的,不实施尾号限行措施。"} -------------------------------------------------------------------------------- /QANet/datasets/aic18/valid_mini.json: -------------------------------------------------------------------------------- 1 | {"url": "http://iask.sina.com.cn/key/5a18d46b84aedabb5c07a131.html", "alternatives": "有|没有|无法确定", "passage": "动漫好看的H:爱的魔法,KEY的作品,喧嚣学院,草莓100%,双恋,爱丽丝学园,灼眼的夏娜,我的女神,赐予护女神的祝福,旋风管家,全金属狂潮,初音岛,命运之夜,心跳回忆。", "query_id": 250001, "answer": "有", "query": "有没有好看的h"} 2 | {"url": "http://www.120ask.com/question/65970789.htm", "alternatives": "能|不能|无法确定", "passage": "醋泡鸡蛋确实具有一定美白嫩化肌肤、提高皮肤亮度、祛斑的效果,因为白醋中含有的醋酸可以加速表皮新陈代谢、软化角质,鸡蛋清中的蛋白质可以嫩化肌肤,收缩毛孔的作用。", "query_id": 250002, "answer": "能", "query": "醋泡鸡蛋真能去斑吗"} 3 | {"url": "http://wenwen.sogou.com/z/q166740184.htm", "alternatives": "听不懂|听得懂|无法确定", "passage": "人有人言,兽有兽语。动物是不会听懂人说话的", "query_id": 250003, "answer": "听不懂", "query": "老鼠听得懂人话吗"} 4 | {"url": "http://wenwen.sogou.com/z/q705319471.htm", "alternatives": "无法确定|大|不大", "passage": "1.前期投资约5-10万元设备投资:柜台、门面装修、电脑及简单家具,一次性投入约2万元。2.3个月运转费用:一家店新开张,要作好两三个月没有生意的准备,最好事先筹备好3个月的运转费用3万元左右。3.进货款:新店开张,店里要备好大约价值2万元的汽车装潢材料。当然,如果有供应商愿意让你代销装潢材料,卖出去再结算,那这一笔费用可以省下。4.手续费:一般来说,注册资金为50万元的企业,代理费用约三四千元。B.每月支出1.房租:在较高档的居民小区附近,租一个20-40平方米的门面,加上水电和物业管理费,一般花费在每月2000-5000元。2.员工工资:开一家汽车装潢小店,至少要聘请一名电工和两名贴膜工。电工月薪在1200-1500元左右,贴膜工大约月薪千元。加上给员工加缴“三金”,每月工资支出约4000元。3.税收:每月固定税收大约500元。4.每月交际费用:不算很高,大约1000元就可以了。", "query_id": 250004, "answer": "无法确定", "query": "开洗车店投资大吗"} 5 | {"url": "http://www.169kang.com/question/166710467.html", "alternatives": "会|不会|无法确定", "passage": "性接触没有保护措施,是有感染的几率的,艾滋病没有特异性的症状。", "query_id": 250005, "answer": "会", "query": "类似性行为会不会感染艾滋病"} 6 | {"url": 
"http://www.120ask.com/question/36999066.htm", "alternatives": "不能|能|无法确定", "passage": "最起码再来月经后在考虑上环,一般在月经干净后3天左右去上环,这时候是最佳的时间。现在还没有来月经,与生育有关系,所以不用担心的。再说月经周期与心情、压力、饮食、内分泌等也有关系,注意保持心情舒畅,不要有大的心理压力。", "query_id": 250006, "answer": "不能", "query": "产后没来月经能上环么"} 7 | {"url": "http://baike.baidu.com/item/%E6%83%A0%E5%B7%9E%E5%8D%97%E7%AB%99/9326466?fr=aladdin", "alternatives": "有|没有|无法确定", "passage": "惠州南站 惠州南站,位于惠州市惠阳区,是厦深铁路沿线大站之一。隶属广州铁路(集团)公司管辖,现为一等站。 位 置 惠州市惠阳区淡水新桥 序号 车次 等级 始发站   终到站 出发时间   到达时间 到时 发时 车站 到达查询站历时 1 G6343 高速 潮汕   广州南 06:09   08:45 07:37 07:38 惠州南 1小时28分   当日到达", "query_id": 250007, "answer": "有", "query": "惠州淡水有高铁站吗"} 8 | {"url": "https://zhidao.baidu.com/question/246119134944261724", "alternatives": "有|没有|无法确定", "passage": "陈立农有弟弟吗 一个妹妹一个弟弟 当然是亲生的 妹妹比弟弟大", "query_id": 250008, "answer": "有", "query": "陈立农有兄弟姐妹吗"} 9 | {"url": "http://m.iask.sina.com.cn/b/2260918.html", "alternatives": "是|否|无法确定", "passage": "飞机在起飞和降落时是最危险的。飞中远距离的飞机所携带的燃油比较多,如果不放掉大部分燃油,着陆时对起落架等部位的冲击力太大,容易发生意外,同时燃油较少,发生事故时,火灾也不会太严。", "query_id": 250009, "answer": "是", "query": "飞机降落前是否放油"} 10 | {"url": "http://club.xywy.com/static/20150427/65656405.htm", "alternatives": "不行|行|无法确定", "passage": "建议入院行X片检查,必要时行踝关节CT检查排除骨折。如无骨折,则考虑韧带损伤,建议受伤关节制动(不活动)、不负重行走,或保护性活动关节,休息;急性期(伤后48小时)内可冷敷消肿,48小时后可热敷促进血液循环以利消肿,局部可外敷消肿止痛药。必须经过冷敷", "query_id": 250010, "answer": "不行", "query": "脚扭伤不冷敷行吗"} -------------------------------------------------------------------------------- /QANet/datasets/word2vec/log: -------------------------------------------------------------------------------- 1 | put jwe_size300.txt instead -------------------------------------------------------------------------------- /QANet/main_debug_ans_v20.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import tensorflow as tf 4 | import ujson as json 5 | from tqdm import tqdm 6 | 7 | from QANet.model_ans_v20 import Model 8 | from QANet.util import get_record_parser20 9 | from QANet.vote import modify_index_save, modify 10 | 11 | ''' 12 | This file is taken and modified from R-Net by HKUST-KnowComp 13 | https://github.com/HKUST-KnowComp/R-Net 14 | ''' 15 | ''' 16 | train:249996 17 | dev:30000 18 | testa:10000 19 | ''' 20 | 21 | def train(config,word_mat,train_counter,dev_counter): 22 | print("Building model...") 23 | parser = get_record_parser20() 24 | graph = tf.Graph() 25 | with graph.as_default() as g: 26 | #读取tfrecords文件 27 | train_dataset = tf.data.TFRecordDataset( 28 | "./data/prepro/train_ver20.tfrecords") 29 | train_dataset = train_dataset.shuffle(train_counter).map(parser).batch(config.batch_size) 30 | 31 | dev_dataset = tf.data.TFRecordDataset( 32 | "./data/prepro/valid_ver20.tfrecords") 33 | dev_dataset = dev_dataset.shuffle(dev_counter).map(parser).batch(config.batch_size) 34 | 35 | iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) 36 | train_initializer = iterator.make_initializer(train_dataset) 37 | dev_initializer = iterator.make_initializer(dev_dataset) 38 | 39 | model = Model(config, iterator, word_mat, word_mat, graph = g) 40 | 41 | sess_config = tf.ConfigProto(allow_soft_placement=True) 42 | sess_config.gpu_options.allow_growth = True 43 | 44 | patience = 0 45 | 46 | with tf.Session() as sess: 47 | writer = tf.summary.FileWriter(config.log_dir) 48 | sess.run(tf.global_variables_initializer()) 49 | saver = tf.train.Saver() 50 | 51 | #训练多少轮就验证一下 52 | eval_per_epoch = 1 53 | #绘制验证集的tensorboard需要用的变量 54 | best_dev_acc = 0 55 | best_dev_loss = 100000 
56 | 57 | #如果有先前训练好的模型,那就重载模型 58 | if os.path.exists(os.path.join(config.save_dir, "checkpoint")): 59 | saver.restore(sess, tf.train.latest_checkpoint(config.save_dir)) 60 | global_step = max(sess.run(model.global_step), 1) 61 | 62 | for e in range(0,config.epoch): 63 | sess.run(train_initializer) 64 | #训练 65 | num_steps = int(train_counter/config.batch_size) 66 | for _ in tqdm(range(num_steps)): 67 | global_step = sess.run(model.global_step) + 1 68 | loss, train_op,ansp,acc = sess.run([model.loss, model.train_op, model.ansp, model.acc], feed_dict={ 69 | model.dropout: config.dropout}) 70 | if global_step % config.period == 0: 71 | loss_sum = tf.Summary(value=[tf.Summary.Value( 72 | tag="model/loss", simple_value=loss), ]) 73 | loss_acc = tf.Summary(value=[tf.Summary.Value( 74 | tag="model/acc", simple_value=acc), ]) 75 | writer.add_summary(loss_sum, global_step) 76 | writer.add_summary(loss_acc, global_step) 77 | if global_step % 100 ==0 : 78 | print("===TRAIN=== global_step is {0}, loss is {1}, ansp is {2}, acc is {3}" 79 | .format(global_step, loss, ansp,acc)) 80 | 81 | filename = os.path.join( 82 | config.save_dir, "model_{}.ckpt".format(e)) 83 | saver.save(sess, filename) 84 | 85 | #验证 86 | if e % eval_per_epoch == 0: 87 | sess.run(dev_initializer) 88 | times = int(dev_counter/config.batch_size) 89 | all_dev_loss = 0 90 | all_dev_acc = 0 91 | for _ in range(times): 92 | loss, ansp, acc = sess.run([model.loss, model.ansp, model.acc], feed_dict={ 93 | model.dropout: 0.0}) 94 | all_dev_acc += acc 95 | all_dev_loss += loss 96 | 97 | all_ave_dev_loss = all_dev_loss/times 98 | all_ave_dev_acc = all_dev_acc/times 99 | summary_all_dev_ave_loss = tf.Summary(value=[tf.Summary.Value( 100 | tag="model/dev_ave_loss", simple_value=all_ave_dev_loss), ]) 101 | summary_all_dev_ave_acc = tf.Summary(value=[tf.Summary.Value( 102 | tag="model/dev_ave_acc", simple_value=all_ave_dev_acc), ]) 103 | writer.add_summary(summary_all_dev_ave_loss,e) 104 | writer.add_summary(summary_all_dev_ave_acc, e) 105 | print("==DEV{0}== ave loss is {1}, ave acc is{2}".format(e,all_ave_dev_loss,all_ave_dev_acc)) 106 | if all_ave_dev_loss > best_dev_loss and all_ave_dev_acc < best_dev_acc: 107 | patience += 1 108 | if patience > config.early_stop: 109 | break 110 | else: 111 | patience = 0 112 | best_dev_loss = min(best_dev_loss, all_ave_dev_loss) 113 | best_dev_acc = max(best_dev_acc,all_ave_dev_acc) 114 | 115 | 116 | def test(config,word_mat,counter,input_path): 117 | 118 | def parse_example(serial_example): 119 | features = tf.parse_single_example(serial_example,features={ 120 | 'context_tokens_ids':tf.FixedLenFeature([],tf.string), 121 | 'context_chars_ids':tf.FixedLenFeature([],tf.string), 122 | 'ques_tokens_ids':tf.FixedLenFeature([],tf.string), 123 | 'ques_chars_ids':tf.FixedLenFeature([],tf.string), 124 | 'ans':tf.FixedLenFeature([],tf.string), 125 | 'q_id': tf.FixedLenFeature([], tf.string) 126 | }) 127 | context_tokens = tf.reshape(tf.decode_raw(features['context_tokens_ids'],tf.int64),[100]) 128 | context_chars = tf.reshape(tf.decode_raw(features['context_chars_ids'],tf.int64),[100,4]) 129 | ques_tokens = tf.reshape(tf.decode_raw(features['ques_tokens_ids'], tf.int64), [30]) 130 | ques_chars = tf.reshape(tf.decode_raw(features['ques_chars_ids'], tf.int64), [30, 4]) 131 | ans = tf.reshape(tf.decode_raw(features['ans'],tf.int64),[3,2]) 132 | q_id = tf.reshape(tf.decode_raw(features['q_id'], tf.int64), []) 133 | return context_tokens,context_chars,ques_tokens,ques_chars,ans,q_id 134 | 135 | graph = tf.Graph() 136 | 
batch_size = 1 137 | print("Loading model...") 138 | with graph.as_default() as g: 139 | test_dataset = tf.data.TFRecordDataset( 140 | "./data/prepro/testa_ver20.tfrecords") 141 | test_dataset = test_dataset.map(parse_example).batch(batch_size) 142 | 143 | iterator = tf.data.Iterator.from_structure(test_dataset.output_types, test_dataset.output_shapes) 144 | test_initializer = iterator.make_initializer(test_dataset) 145 | 146 | model = Model(config, iterator, word_mat, word_mat,trainable=False, graph=g) 147 | 148 | sess_config = tf.ConfigProto(allow_soft_placement=True) 149 | sess_config.gpu_options.allow_growth = True 150 | 151 | 152 | with tf.Session(config=sess_config) as sess: 153 | sess.run(tf.global_variables_initializer()) 154 | sess.run(test_initializer) 155 | saver = tf.train.Saver() 156 | saver.restore(sess, tf.train.latest_checkpoint(config.save_dir)) 157 | 158 | results = [] 159 | for step in tqdm(range(counter)): 160 | q_id,logits = sess.run([model.q_id,model.outer]) 161 | for i in range(batch_size): 162 | result = {} 163 | 164 | result['query_id'] = q_id.tolist()[0] 165 | result['predict'] = logits[i].tolist() 166 | 167 | s = json.dumps(result) 168 | results.append("{}\n".format(s)) 169 | with open(config.answer_file, "w") as fh: 170 | fh.writelines(results) 171 | 172 | modify_index_save(input_path, 173 | './datasets/aic18/sorted_test_ver20.json') 174 | modify(config.answer_file, 175 | './datasets/aic18/sorted_test_ver20.json', 176 | config.sorted_answer_file) 177 | 178 | def valid(config,word_mat,counter): 179 | 180 | def parse_example(serial_example): 181 | features = tf.parse_single_example(serial_example,features={ 182 | 'context_tokens_ids':tf.FixedLenFeature([],tf.string), 183 | 'context_chars_ids':tf.FixedLenFeature([],tf.string), 184 | 'ques_tokens_ids':tf.FixedLenFeature([],tf.string), 185 | 'ques_chars_ids':tf.FixedLenFeature([],tf.string), 186 | 'ans':tf.FixedLenFeature([],tf.string), 187 | 'q_id': tf.FixedLenFeature([], tf.string) 188 | }) 189 | context_tokens = tf.reshape(tf.decode_raw(features['context_tokens_ids'],tf.int64),[100]) 190 | context_chars = tf.reshape(tf.decode_raw(features['context_chars_ids'],tf.int64),[100,4]) 191 | ques_tokens = tf.reshape(tf.decode_raw(features['ques_tokens_ids'], tf.int64), [30]) 192 | ques_chars = tf.reshape(tf.decode_raw(features['ques_chars_ids'], tf.int64), [30, 4]) 193 | ans = tf.reshape(tf.decode_raw(features['ans'],tf.int64),[3,2]) 194 | q_id = tf.reshape(tf.decode_raw(features['q_id'], tf.int64), []) 195 | return context_tokens,context_chars,ques_tokens,ques_chars,ans,q_id 196 | 197 | graph = tf.Graph() 198 | batch_size = 1 199 | print("Loading model...") 200 | with graph.as_default() as g: 201 | test_dataset = tf.data.TFRecordDataset( 202 | "./data/prepro/valid_ver20.tfrecords") 203 | test_dataset = test_dataset.map(parse_example).batch(batch_size) 204 | 205 | iterator = tf.data.Iterator.from_structure(test_dataset.output_types, test_dataset.output_shapes) 206 | test_initializer = iterator.make_initializer(test_dataset) 207 | 208 | model = Model(config, iterator, word_mat, word_mat,trainable=False, graph=g) 209 | 210 | sess_config = tf.ConfigProto(allow_soft_placement=True) 211 | sess_config.gpu_options.allow_growth = True 212 | 213 | 214 | with tf.Session(config=sess_config) as sess: 215 | sess.run(tf.global_variables_initializer()) 216 | sess.run(test_initializer) 217 | saver = tf.train.Saver() 218 | saver.restore(sess, tf.train.latest_checkpoint(config.save_dir)) 219 | 220 | results = [] 221 | for step in 
tqdm(range(counter)): 222 | q_id,logits = sess.run([model.q_id,model.outer]) 223 | for i in range(batch_size): 224 | result = {} 225 | # try: 226 | result['query_id'] = q_id.tolist()[0] 227 | result['predict'] = logits[i].tolist() 228 | 229 | s = json.dumps(result) 230 | results.append("{}\n".format(s)) 231 | with open(config.valid_file, "w") as fh: 232 | fh.writelines(results) 233 | 234 | #这两个方法就是将预测出的答案排序成正向|负向|不确定的形式,因为默认的输出是第一个是正确答案 235 | modify_index_save('./datasets/aic18/valid_mini.json', 236 | './datasets/aic18/sorted_valid_ver20.json') 237 | modify(config.valid_file, 238 | './datasets/aic18/sorted_valid_ver20.json', 239 | config.sorted_valid_file) -------------------------------------------------------------------------------- /QANet/main_debug_cla_v60.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import tensorflow as tf 4 | import ujson as json 5 | from tqdm import tqdm 6 | 7 | from QANet.util import get_record_parser60 8 | 9 | ''' 10 | This file is taken and modified from R-Net by HKUST-KnowComp 11 | https://github.com/HKUST-KnowComp/R-Net 12 | ''' 13 | ''' 14 | train:249431 15 | dev:29930 16 | testa:10000249431 17 | ''' 18 | from QANet.model_cla_v60 import Model 19 | 20 | def train(config,word_mat,train_counter,dev_counter): 21 | print("Building model...") 22 | parser = get_record_parser60() 23 | graph = tf.Graph() 24 | with graph.as_default() as g: 25 | #读取tfrecords文件 26 | train_dataset = tf.data.TFRecordDataset( 27 | "./data/prepro/train_ver60.tfrecords") 28 | train_dataset = train_dataset.shuffle(train_counter).map(parser).batch(config.batch_size) 29 | 30 | dev_dataset = tf.data.TFRecordDataset( 31 | "./data/prepro/valid_ver60.tfrecords") 32 | dev_dataset = dev_dataset.shuffle(dev_counter).map(parser).batch(config.batch_size) 33 | 34 | iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) 35 | train_initializer = iterator.make_initializer(train_dataset) 36 | dev_initializer = iterator.make_initializer(dev_dataset) 37 | 38 | model = Model(config, iterator, word_mat, word_mat, graph = g) 39 | 40 | sess_config = tf.ConfigProto(allow_soft_placement=True) 41 | sess_config.gpu_options.allow_growth = True 42 | 43 | patience = 0 44 | 45 | with tf.Session() as sess: 46 | writer = tf.summary.FileWriter(config.log_dir) 47 | sess.run(tf.global_variables_initializer()) 48 | saver = tf.train.Saver() 49 | 50 | #训练多少轮就验证一下 51 | eval_per_epoch = 1 52 | #绘制验证集的tensorboard需要用的变量 53 | best_dev_acc = 0 54 | best_dev_loss = 100000 55 | 56 | #如果有先前训练好的模型,那就重载模型 57 | if os.path.exists(os.path.join(config.save_dir, "checkpoint")): 58 | saver.restore(sess, tf.train.latest_checkpoint(config.save_dir)) 59 | global_step = max(sess.run(model.global_step), 1) 60 | 61 | for e in range(0,config.epoch): 62 | sess.run(train_initializer) 63 | #训练 64 | num_steps = int(train_counter/config.batch_size) 65 | for _ in tqdm(range(num_steps)): 66 | global_step = sess.run(model.global_step) + 1 67 | loss, train_op,ansp,acc = sess.run([model.loss, model.train_op, model.ansp, model.acc], feed_dict={ 68 | model.dropout: config.dropout}) 69 | if global_step % config.period == 0: 70 | loss_sum = tf.Summary(value=[tf.Summary.Value( 71 | tag="model/loss", simple_value=loss), ]) 72 | loss_acc = tf.Summary(value=[tf.Summary.Value( 73 | tag="model/acc", simple_value=acc), ]) 74 | writer.add_summary(loss_sum, global_step) 75 | writer.add_summary(loss_acc, global_step) 76 | if global_step % 100 ==0 : 77 | print("===TRAIN=== 
global_step is {0}, loss is {1}, ansp is {2}, acc is {3}" 78 | .format(global_step, loss, ansp,acc)) 79 | 80 | filename = os.path.join( 81 | config.save_dir, "model_{}.ckpt".format(e)) 82 | saver.save(sess, filename) 83 | 84 | #验证 85 | if e % eval_per_epoch == 0: 86 | sess.run(dev_initializer) 87 | times = int(dev_counter/config.batch_size) 88 | all_dev_loss = 0 89 | all_dev_acc = 0 90 | for _ in range(times): 91 | loss, ansp, acc = sess.run([model.loss, model.ansp, model.acc], feed_dict={ 92 | model.dropout: 0.0}) 93 | all_dev_acc += acc 94 | all_dev_loss += loss 95 | 96 | all_ave_dev_loss = all_dev_loss/times 97 | all_ave_dev_acc = all_dev_acc/times 98 | summary_all_dev_ave_loss = tf.Summary(value=[tf.Summary.Value( 99 | tag="model/dev_ave_loss", simple_value=all_ave_dev_loss), ]) 100 | summary_all_dev_ave_acc = tf.Summary(value=[tf.Summary.Value( 101 | tag="model/dev_ave_acc", simple_value=all_ave_dev_acc), ]) 102 | writer.add_summary(summary_all_dev_ave_loss,e) 103 | writer.add_summary(summary_all_dev_ave_acc, e) 104 | print("==DEV{0}== ave loss is {1}, ave acc is{2}".format(e,all_ave_dev_loss,all_ave_dev_acc)) 105 | if all_ave_dev_loss > best_dev_loss and all_ave_dev_acc < best_dev_acc: 106 | patience += 1 107 | if patience > config.early_stop: 108 | break 109 | else: 110 | patience = 0 111 | best_dev_loss = min(best_dev_loss, all_ave_dev_loss) 112 | best_dev_acc = max(best_dev_acc,all_ave_dev_acc) 113 | 114 | 115 | def test(config,word_mat,counter): 116 | def parse_example(serial_example): 117 | features = tf.parse_single_example(serial_example, features={ 118 | 'context_tokens_ids': tf.FixedLenFeature([], tf.string), 119 | 'context_chars_ids': tf.FixedLenFeature([], tf.string), 120 | 'ques_tokens_ids': tf.FixedLenFeature([], tf.string), 121 | 'ques_chars_ids': tf.FixedLenFeature([], tf.string), 122 | 'q_id': tf.FixedLenFeature([], tf.string) 123 | }) 124 | context_tokens = tf.reshape(tf.decode_raw(features['context_tokens_ids'], tf.int64), [100]) 125 | context_chars = tf.reshape(tf.decode_raw(features['context_chars_ids'], tf.int64), [100, 4]) 126 | ques_tokens = tf.reshape(tf.decode_raw(features['ques_tokens_ids'], tf.int64), [30]) 127 | ques_chars = tf.reshape(tf.decode_raw(features['ques_chars_ids'], tf.int64), [30, 4]) 128 | q_id = tf.reshape(tf.decode_raw(features['q_id'], tf.int64), []) 129 | 130 | return context_tokens, context_chars, ques_tokens, ques_chars, q_id 131 | 132 | # word_mat, w2id = GetEmbeddingFromNoHeadTXT( 133 | # "./datasets/word2vec/jwe_size300.txt", word_dim=300) 134 | 135 | graph = tf.Graph() 136 | batch_size = 1 137 | print("Loading model...") 138 | with graph.as_default() as g: 139 | train_dataset = tf.data.TFRecordDataset( 140 | "./data/prepro/testa_ver60.tfrecords") 141 | train_dataset = train_dataset.map(parse_example).batch(batch_size) 142 | 143 | iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) 144 | train_initializer = iterator.make_initializer(train_dataset) 145 | 146 | model = Model(config, iterator, word_mat, word_mat,trainable=False, graph=g) 147 | 148 | sess_config = tf.ConfigProto(allow_soft_placement=True) 149 | sess_config.gpu_options.allow_growth = True 150 | out_file = open(config.answer_file,"w",encoding="utf8",errors='ignore') 151 | with tf.Session(config=sess_config) as sess: 152 | sess.run(tf.global_variables_initializer()) 153 | sess.run(train_initializer) 154 | saver = tf.train.Saver() 155 | saver.restore(sess, tf.train.latest_checkpoint(config.save_dir)) 156 | 157 | results = [] 158 
| for step in tqdm(range(counter)): 159 | 160 | qa_id, logits = sess.run([model.id,model.logits]) 161 | for i in range(batch_size): 162 | result = {} 163 | result['query_id'] = qa_id.tolist()[i] 164 | result['predict'] = logits.tolist()[i] 165 | 166 | results.append(result) 167 | for i in range(len(results)): 168 | r = json.dumps(results[i], ensure_ascii=False) 169 | out_file.write("{}\n".format(r)) 170 | out_file.close() 171 | 172 | def valid(config,word_mat,counter): 173 | def parse_example(serial_example): 174 | features = tf.parse_single_example(serial_example, features={ 175 | 'context_tokens_ids': tf.FixedLenFeature([], tf.string), 176 | 'context_chars_ids': tf.FixedLenFeature([], tf.string), 177 | 'ques_tokens_ids': tf.FixedLenFeature([], tf.string), 178 | 'ques_chars_ids': tf.FixedLenFeature([], tf.string), 179 | 'ans': tf.FixedLenFeature([], tf.string), 180 | 'q_id': tf.FixedLenFeature([], tf.string) 181 | }) 182 | context_tokens = tf.reshape(tf.decode_raw(features['context_tokens_ids'], tf.int64), [100]) 183 | context_chars = tf.reshape(tf.decode_raw(features['context_chars_ids'], tf.int64), [100, 4]) 184 | ques_tokens = tf.reshape(tf.decode_raw(features['ques_tokens_ids'], tf.int64), [30]) 185 | ques_chars = tf.reshape(tf.decode_raw(features['ques_chars_ids'], tf.int64), [30, 4]) 186 | ans = tf.reshape(tf.decode_raw(features['ans'], tf.int64), [3]) 187 | q_id = tf.reshape(tf.decode_raw(features['q_id'], tf.int64), []) 188 | return context_tokens, context_chars, ques_tokens, ques_chars, ans,q_id 189 | 190 | # word_mat, w2id = GetEmbeddingFromNoHeadTXT( 191 | # "./datasets/word2vec/jwe_size300.txt", word_dim=300) 192 | 193 | graph = tf.Graph() 194 | batch_size = 1 195 | print("Loading model...") 196 | with graph.as_default() as g: 197 | train_dataset = tf.data.TFRecordDataset( 198 | "./data/prepro/valid_ver60.tfrecords") 199 | train_dataset = train_dataset.map(parse_example).batch(batch_size) 200 | 201 | iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) 202 | train_initializer = iterator.make_initializer(train_dataset) 203 | 204 | model = Model(config, iterator, word_mat, word_mat,trainable=False,wrong=True, graph=g) 205 | 206 | sess_config = tf.ConfigProto(allow_soft_placement=True) 207 | sess_config.gpu_options.allow_growth = True 208 | out_file = open(config.valid_file,"w",encoding="utf8",errors='ignore') 209 | with tf.Session(config=sess_config) as sess: 210 | sess.run(tf.global_variables_initializer()) 211 | sess.run(train_initializer) 212 | saver = tf.train.Saver() 213 | saver.restore(sess, tf.train.latest_checkpoint(config.save_dir)) 214 | 215 | results = [] 216 | for step in tqdm(range(counter)): 217 | 218 | qa_id,logits, label = sess.run([model.qa_id,model.logits,model.ans]) 219 | for i in range(batch_size): 220 | result = {} 221 | result['query_id'] = qa_id.tolist()[i] 222 | result['predict'] = logits.tolist()[i] 223 | result['label'] = label.tolist()[i] 224 | results.append(result) 225 | for i in range(len(results)): 226 | r = json.dumps(results[i], ensure_ascii=False) 227 | out_file.write("{}\n".format(r)) 228 | out_file.close() -------------------------------------------------------------------------------- /QANet/main_debug_cla_ver646.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import ujson as json 3 | from tqdm import tqdm 4 | 5 | from QANet.util import get_record_parser60 6 | from QANet.model_cla_ver646 import Model 7 | 8 | import os 9 | 10 | 
''' 11 | This file is taken and modified from R-Net by HKUST-KnowComp 12 | https://github.com/HKUST-KnowComp/R-Net 13 | ''' 14 | ''' 15 | train:249431 16 | dev:29930 17 | testa:10000249431 18 | ''' 19 | 20 | def train(config,word_mat,train_counter,dev_counter): 21 | print("Building model...") 22 | parser = get_record_parser60() 23 | graph = tf.Graph() 24 | with graph.as_default() as g: 25 | #读取tfrecords文件 26 | train_dataset = tf.data.TFRecordDataset( 27 | "./data/prepro/train_ver60.tfrecords") 28 | train_dataset = train_dataset.shuffle(train_counter).map(parser).batch(config.batch_size) 29 | 30 | dev_dataset = tf.data.TFRecordDataset( 31 | "./data/prepro/valid_ver60.tfrecords") 32 | dev_dataset = dev_dataset.shuffle(dev_counter).map(parser).batch(config.batch_size) 33 | 34 | iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) 35 | train_initializer = iterator.make_initializer(train_dataset) 36 | dev_initializer = iterator.make_initializer(dev_dataset) 37 | 38 | model = Model(config, iterator, word_mat, word_mat, graph = g) 39 | 40 | sess_config = tf.ConfigProto(allow_soft_placement=True) 41 | sess_config.gpu_options.allow_growth = True 42 | 43 | patience = 0 44 | 45 | with tf.Session() as sess: 46 | writer = tf.summary.FileWriter(config.log_dir) 47 | sess.run(tf.global_variables_initializer()) 48 | saver = tf.train.Saver() 49 | 50 | #训练多少轮就验证一下 51 | eval_per_epoch = 1 52 | #绘制验证集的tensorboard需要用的变量 53 | best_dev_acc = 0 54 | best_dev_loss = 100000 55 | 56 | #如果有先前训练好的模型,那就重载模型 57 | if os.path.exists(os.path.join(config.save_dir, "checkpoint")): 58 | saver.restore(sess, tf.train.latest_checkpoint(config.save_dir)) 59 | global_step = max(sess.run(model.global_step), 1) 60 | 61 | for e in range(0,config.epoch): 62 | sess.run(train_initializer) 63 | #训练 64 | num_steps = int(train_counter/config.batch_size) 65 | for _ in tqdm(range(num_steps)): 66 | global_step = sess.run(model.global_step) + 1 67 | loss, train_op,ansp,acc = sess.run([model.loss, model.train_op, model.ansp, model.acc], feed_dict={ 68 | model.dropout: config.dropout}) 69 | if global_step % config.period == 0: 70 | loss_sum = tf.Summary(value=[tf.Summary.Value( 71 | tag="model/loss", simple_value=loss), ]) 72 | loss_acc = tf.Summary(value=[tf.Summary.Value( 73 | tag="model/acc", simple_value=acc), ]) 74 | writer.add_summary(loss_sum, global_step) 75 | writer.add_summary(loss_acc, global_step) 76 | if global_step % 100 ==0 : 77 | print("===TRAIN=== global_step is {0}, loss is {1}, ansp is {2}, acc is {3}" 78 | .format(global_step, loss, ansp,acc)) 79 | 80 | filename = os.path.join( 81 | config.save_dir, "model_{}.ckpt".format(e)) 82 | saver.save(sess, filename) 83 | 84 | #验证 85 | if e % eval_per_epoch == 0: 86 | sess.run(dev_initializer) 87 | times = int(dev_counter/config.batch_size) 88 | all_dev_loss = 0 89 | all_dev_acc = 0 90 | for _ in range(times): 91 | loss, ansp, acc = sess.run([model.loss, model.ansp, model.acc], feed_dict={ 92 | model.dropout: 0.0}) 93 | all_dev_acc += acc 94 | all_dev_loss += loss 95 | 96 | all_ave_dev_loss = all_dev_loss/times 97 | all_ave_dev_acc = all_dev_acc/times 98 | summary_all_dev_ave_loss = tf.Summary(value=[tf.Summary.Value( 99 | tag="model/dev_ave_loss", simple_value=all_ave_dev_loss), ]) 100 | summary_all_dev_ave_acc = tf.Summary(value=[tf.Summary.Value( 101 | tag="model/dev_ave_acc", simple_value=all_ave_dev_acc), ]) 102 | writer.add_summary(summary_all_dev_ave_loss,e) 103 | writer.add_summary(summary_all_dev_ave_acc, e) 104 | print("==DEV{0}== 
ave loss is {1}, ave acc is{2}".format(e,all_ave_dev_loss,all_ave_dev_acc)) 105 | if all_ave_dev_loss > best_dev_loss and all_ave_dev_acc < best_dev_acc: 106 | patience += 1 107 | if patience > config.early_stop: 108 | break 109 | else: 110 | patience = 0 111 | best_dev_loss = min(best_dev_loss, all_ave_dev_loss) 112 | best_dev_acc = max(best_dev_acc,all_ave_dev_acc) 113 | 114 | def test(config,word_mat,counter): 115 | def parse_example(serial_example): 116 | features = tf.parse_single_example(serial_example, features={ 117 | 'context_tokens_ids': tf.FixedLenFeature([], tf.string), 118 | 'context_chars_ids': tf.FixedLenFeature([], tf.string), 119 | 'ques_tokens_ids': tf.FixedLenFeature([], tf.string), 120 | 'ques_chars_ids': tf.FixedLenFeature([], tf.string), 121 | 'q_id': tf.FixedLenFeature([], tf.string) 122 | }) 123 | context_tokens = tf.reshape(tf.decode_raw(features['context_tokens_ids'], tf.int64), [100]) 124 | context_chars = tf.reshape(tf.decode_raw(features['context_chars_ids'], tf.int64), [100, 4]) 125 | ques_tokens = tf.reshape(tf.decode_raw(features['ques_tokens_ids'], tf.int64), [30]) 126 | ques_chars = tf.reshape(tf.decode_raw(features['ques_chars_ids'], tf.int64), [30, 4]) 127 | q_id = tf.reshape(tf.decode_raw(features['q_id'], tf.int64), []) 128 | return context_tokens, context_chars, ques_tokens, ques_chars, q_id 129 | 130 | 131 | graph = tf.Graph() 132 | batch_size = 1 133 | print("Loading model...") 134 | with graph.as_default() as g: 135 | train_dataset = tf.data.TFRecordDataset( 136 | "./data/prepro/testa_ver60.tfrecords") 137 | train_dataset = train_dataset.map(parse_example).batch(batch_size) 138 | 139 | iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) 140 | train_initializer = iterator.make_initializer(train_dataset) 141 | 142 | model = Model(config, iterator, word_mat, word_mat,trainable=False, graph=g) 143 | 144 | sess_config = tf.ConfigProto(allow_soft_placement=True) 145 | sess_config.gpu_options.allow_growth = True 146 | out_file = open(config.answer_file,"w",encoding="utf8",errors='ignore') 147 | with tf.Session(config=sess_config) as sess: 148 | sess.run(tf.global_variables_initializer()) 149 | sess.run(train_initializer) 150 | saver = tf.train.Saver() 151 | saver.restore(sess, tf.train.latest_checkpoint(config.save_dir)) 152 | 153 | results = [] 154 | for step in tqdm(range(counter)): 155 | 156 | qa_id, logits = sess.run([model.id,model.logits]) 157 | for i in range(batch_size): 158 | result = {} 159 | result['query_id'] = qa_id.tolist()[i] 160 | result['predict'] = logits.tolist()[i] 161 | 162 | results.append(result) 163 | for i in range(len(results)): 164 | r = json.dumps(results[i], ensure_ascii=False) 165 | out_file.write("{}\n".format(r)) 166 | out_file.close() 167 | 168 | def valid(config,word_mat,counter): 169 | def parse_example(serial_example): 170 | features = tf.parse_single_example(serial_example, features={ 171 | 'context_tokens_ids': tf.FixedLenFeature([], tf.string), 172 | 'context_chars_ids': tf.FixedLenFeature([], tf.string), 173 | 'ques_tokens_ids': tf.FixedLenFeature([], tf.string), 174 | 'ques_chars_ids': tf.FixedLenFeature([], tf.string), 175 | 'ans': tf.FixedLenFeature([], tf.string), 176 | 'q_id': tf.FixedLenFeature([], tf.string) 177 | }) 178 | context_tokens = tf.reshape(tf.decode_raw(features['context_tokens_ids'], tf.int64), [100]) 179 | context_chars = tf.reshape(tf.decode_raw(features['context_chars_ids'], tf.int64), [100, 4]) 180 | ques_tokens = 
tf.reshape(tf.decode_raw(features['ques_tokens_ids'], tf.int64), [30]) 181 | ques_chars = tf.reshape(tf.decode_raw(features['ques_chars_ids'], tf.int64), [30, 4]) 182 | ans = tf.reshape(tf.decode_raw(features['ans'], tf.int64), [3]) 183 | q_id = tf.reshape(tf.decode_raw(features['q_id'], tf.int64), []) 184 | return context_tokens, context_chars, ques_tokens, ques_chars, ans,q_id 185 | 186 | 187 | graph = tf.Graph() 188 | batch_size = 1 189 | print("Loading model...") 190 | with graph.as_default() as g: 191 | train_dataset = tf.data.TFRecordDataset( 192 | "./data/prepro/valid_ver60.tfrecords") 193 | train_dataset = train_dataset.map(parse_example).batch(batch_size) 194 | 195 | iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) 196 | train_initializer = iterator.make_initializer(train_dataset) 197 | 198 | model = Model(config, iterator, word_mat, word_mat,trainable=False,wrong=True, graph=g) 199 | 200 | sess_config = tf.ConfigProto(allow_soft_placement=True) 201 | sess_config.gpu_options.allow_growth = True 202 | out_file = open(config.valid_file,"w",encoding="utf8",errors='ignore') 203 | with tf.Session(config=sess_config) as sess: 204 | sess.run(tf.global_variables_initializer()) 205 | sess.run(train_initializer) 206 | saver = tf.train.Saver() 207 | saver.restore(sess, tf.train.latest_checkpoint(config.save_dir)) 208 | 209 | results = [] 210 | for step in tqdm(range(counter)): 211 | try: 212 | qa_id, logits, label = sess.run([model.qa_id, model.logits, model.ans]) 213 | except: 214 | print(step) 215 | exit(1) 216 | for i in range(batch_size): 217 | result = {} 218 | result['query_id'] = qa_id.tolist()[i] 219 | result['predict'] = logits.tolist()[i] 220 | result['label'] = label.tolist()[i] 221 | results.append(result) 222 | for i in range(len(results)): 223 | r = json.dumps(results[i], ensure_ascii=False) 224 | out_file.write("{}\n".format(r)) 225 | out_file.close() -------------------------------------------------------------------------------- /QANet/model_ans_v20.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from QANet.layers import initializer, regularizer, residual_block, highway, conv, mask_logits, trilinear, total_params, optimized_trilinear_for_attention 3 | 4 | 5 | class Model(object): 6 | def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True,wrong=False, opt=False, demo = False, graph = None): 7 | self.config = config 8 | self.demo = demo 9 | self.graph = graph if graph is not None else tf.Graph() 10 | with self.graph.as_default(): 11 | 12 | self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32, 13 | initializer=tf.constant_initializer(0), trainable=False) 14 | self.dropout = tf.placeholder_with_default(0.0, (), name="dropout") 15 | if self.demo: 16 | self.c = tf.placeholder(tf.int64, [None, config.test_para_limit],"context") 17 | self.q = tf.placeholder(tf.int64, [None, config.test_ques_limit],"question") 18 | self.ch = tf.placeholder(tf.int64, [None, config.test_para_limit, config.char_limit],"context_char") 19 | self.qh = tf.placeholder(tf.int64, [None, config.test_ques_limit, config.char_limit],"question_char") 20 | self.ans = tf.placeholder(tf.int64, [None, config.test_para_limit],"answer_index") 21 | else: 22 | self.c, self.ch, self.q, self.qh, self.ans,self.q_id = batch.get_next() 23 | 24 | 25 | 26 | self.word_mat = tf.Variable(word_mat,name="word_mat", dtype=tf.float32,trainable=False) 27 | 28 | self.char_mat 
= self.word_mat 29 | self.c_mask = tf.cast(self.c, tf.bool) 30 | self.q_mask = tf.cast(self.q, tf.bool) 31 | self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1) 32 | self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 33 | if opt: 34 | N, CL = config.batch_size if not self.demo else 1, config.char_limit 35 | self.c_maxlen = tf.reduce_max(self.c_len) 36 | self.q_maxlen = tf.reduce_max(self.q_len) 37 | 38 | self.c = tf.slice(self.c, [0, 0], [N, self.c_maxlen]) 39 | self.q = tf.slice(self.q, [0, 0], [N, self.q_maxlen]) 40 | self.c_mask = tf.slice(self.c_mask, [0, 0], [N, self.c_maxlen]) 41 | self.q_mask = tf.slice(self.q_mask, [0, 0], [N, self.q_maxlen]) 42 | self.ch = tf.slice(self.ch, [0, 0, 0], [N, self.c_maxlen, CL]) 43 | self.qh = tf.slice(self.qh, [0, 0, 0], [N, self.q_maxlen, CL]) 44 | 45 | else: 46 | self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit 47 | 48 | self.ch_len = tf.reshape(tf.reduce_sum( 49 | tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2), [-1]) 50 | self.qh_len = tf.reshape(tf.reduce_sum( 51 | tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2), [-1]) 52 | 53 | self.forward(trainable) 54 | total_params() 55 | 56 | if trainable: 57 | self.lr = tf.minimum(config.learning_rate, 0.001 / tf.log(999.) * tf.log(tf.cast(self.global_step, tf.float32) + 1)) 58 | self.opt = tf.train.AdamOptimizer(learning_rate = self.lr, beta1 = 0.8, beta2 = 0.999, epsilon = 1e-7) 59 | grads = self.opt.compute_gradients(self.loss) 60 | gradients, variables = zip(*grads) 61 | capped_grads, _ = tf.clip_by_global_norm( 62 | gradients, config.grad_clip) 63 | self.train_op = self.opt.apply_gradients( 64 | zip(capped_grads, variables), global_step=self.global_step) 65 | 66 | def forward(self,trainable): 67 | config = self.config 68 | N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads 69 | if not trainable: 70 | N = 1 71 | 72 | with tf.variable_scope("Input_Embedding_Layer"): 73 | ch_emb = tf.nn.embedding_lookup(self.char_mat, self.ch) 74 | ch_emb = tf.reshape(ch_emb, [N * PL, CL, dc]) 75 | qh_emb = tf.reshape(tf.nn.embedding_lookup( 76 | self.char_mat, self.qh), [N * QL, CL, dc]) 77 | ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout) 78 | qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout) 79 | 80 | # Bidaf style conv-highway encoder 81 | ch_emb = conv(ch_emb, d, 82 | bias = True, activation = tf.nn.relu, kernel_size = 2, name = "char_conv", reuse = None) 83 | qh_emb = conv(qh_emb, d, 84 | bias = True, activation = tf.nn.relu, kernel_size = 2, name = "char_conv", reuse = True) 85 | 86 | ch_emb = tf.reduce_max(ch_emb, axis = 1) 87 | qh_emb = tf.reduce_max(qh_emb, axis = 1) 88 | 89 | ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]]) 90 | qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]]) 91 | 92 | c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout) 93 | q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout) 94 | 95 | c_emb = tf.concat([c_emb, ch_emb], axis=2) 96 | q_emb = tf.concat([q_emb, qh_emb], axis=2) 97 | 98 | c_emb = highway(c_emb, size = d, scope = "highway", dropout = self.dropout, reuse = None) 99 | q_emb = highway(q_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True) 100 | 101 | #==ANS EMBEDDING=== 102 | ans_maxlen = 2 103 | ans_num = 3 104 | ans_dim = 300 105 | ans_emb = tf.nn.embedding_lookup(self.word_mat, 
self.ans) 106 | ans_emb = tf.reshape(ans_emb, [N * ans_num, ans_maxlen, ans_dim]) 107 | ans_emb = highway(ans_emb,size = d, scope = "ans_highway") 108 | 109 | 110 | with tf.variable_scope("Embedding_Encoder_Layer"): 111 | c = residual_block(c_emb, 112 | num_blocks = 1, 113 | num_conv_layers = 4, 114 | kernel_size = 7, 115 | mask = self.c_mask, 116 | num_filters = d, 117 | num_heads = nh, 118 | seq_len = self.c_len, 119 | scope = "Encoder_Residual_Block", 120 | bias = False, 121 | dropout = self.dropout) 122 | q = residual_block(q_emb, 123 | num_blocks = 1, 124 | num_conv_layers = 4, 125 | kernel_size = 7, 126 | mask = self.q_mask, 127 | num_filters = d, 128 | num_heads = nh, 129 | seq_len = self.q_len, 130 | scope = "Encoder_Residual_Block", 131 | reuse = True, # Share the weights between passage and question 132 | bias = False, 133 | dropout = self.dropout) 134 | a = residual_block(ans_emb, 135 | num_blocks = 1, 136 | num_conv_layers = 4, 137 | kernel_size = 7, 138 | num_filters = d, 139 | num_heads = nh, 140 | scope = "Encoder_Residual_Block", 141 | reuse=True, 142 | bias = False) 143 | 144 | with tf.variable_scope("Context_to_Query_Attention_Layer"): 145 | 146 | S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen, input_keep_prob = 1.0 - self.dropout) 147 | mask_q = tf.expand_dims(self.q_mask, 1) 148 | S_ = tf.nn.softmax(mask_logits(S, mask = mask_q)) 149 | mask_c = tf.expand_dims(self.c_mask, 2) 150 | S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), dim = 1),(0,2,1)) 151 | self.c2q = tf.matmul(S_, q) 152 | self.q2c = tf.matmul(tf.matmul(S_, S_T), c) 153 | attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c] 154 | 155 | #===ANS_SELF_ATTENTION=== 156 | a_logits = tf.layers.dense(a,units=1,use_bias=False) 157 | a_weights = tf.nn.softmax(a_logits,axis=1) 158 | a_weights = tf.transpose(a_weights, perm=[0, 2, 1]) 159 | a_att = tf.matmul(a_weights, ans_emb) 160 | a_att = tf.squeeze(a_att) 161 | 162 | with tf.variable_scope("Model_Encoder_Layer"): 163 | inputs = tf.concat(attention_outputs, axis = -1) 164 | self.enc = [conv(inputs, d, name = "input_projection")] 165 | for i in range(2): 166 | if i % 2 == 0: # dropout every 2 blocks 167 | self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout) 168 | self.enc.append( 169 | residual_block(self.enc[i], 170 | num_blocks = 7, 171 | num_conv_layers = 2, 172 | kernel_size = 5, 173 | mask = self.c_mask, 174 | num_filters = d, 175 | num_heads = nh, 176 | seq_len = self.c_len, 177 | scope = "Model_Encoder", 178 | bias = False, 179 | reuse = True if i > 0 else None, 180 | dropout = self.dropout) 181 | ) 182 | 183 | with tf.variable_scope("Output_Layer"): 184 | sl_tmp = tf.concat([self.enc[1], self.enc[2]], axis=-1) 185 | 186 | l_tmp1 = conv(sl_tmp, 1, bias=False, name="start_pointer") 187 | 188 | l_tmp1 = tf.transpose(l_tmp1,perm=[0,2,1]) 189 | 190 | reshape = tf.layers.dense(l_tmp1,units=d,use_bias=False) 191 | reshape = tf.transpose(reshape,perm=[0,2,1]) 192 | a_att = tf.reshape(a_att, [-1, 3, d]) 193 | logits = tf.matmul(a_att,reshape) 194 | 195 | logits = tf.squeeze(logits, -1) 196 | self.logits = logits 197 | 198 | outer = tf.nn.softmax(self.logits,axis=1) 199 | self.ansp = tf.argmax(outer, axis=1) 200 | self.outer = outer 201 | if trainable: 202 | ans_label = [[1,0,0]]*N 203 | losses = tf.losses.softmax_cross_entropy(ans_label, self.logits) 204 | self.loss = losses 205 | 206 | self.acc = tf.reduce_mean(tf.cast(tf.equal(self.ansp, tf.argmax(ans_label, 1)), tf.float32)) 207 | 208 | if config.l2_norm 
is not None: 209 | variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) 210 | l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables) 211 | self.loss += l2_loss 212 | 213 | if config.decay is not None: 214 | self.var_ema = tf.train.ExponentialMovingAverage(config.decay) 215 | ema_op = self.var_ema.apply(tf.trainable_variables()) 216 | with tf.control_dependencies([ema_op]): 217 | self.loss = tf.identity(self.loss) 218 | 219 | self.assign_vars = [] 220 | for var in tf.global_variables(): 221 | v = self.var_ema.average(var) 222 | if v: 223 | self.assign_vars.append(tf.assign(var,v)) 224 | 225 | def get_loss(self): 226 | return self.loss 227 | 228 | def get_global_step(self): 229 | return self.global_step 230 | -------------------------------------------------------------------------------- /QANet/model_cla_v60.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from QANet.layers import initializer, regularizer, residual_block, highway, conv, mask_logits, trilinear, total_params, optimized_trilinear_for_attention 3 | 4 | class Model(object): 5 | def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True,wrong=False, opt=False, demo = False, graph = None): 6 | self.config = config 7 | self.demo = demo 8 | self.graph = graph if graph is not None else tf.Graph() 9 | with self.graph.as_default(): 10 | 11 | self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32, 12 | initializer=tf.constant_initializer(0), trainable=False) 13 | self.dropout = tf.placeholder_with_default(0.0, (), name="dropout") 14 | if self.demo: 15 | self.c = tf.placeholder(tf.int64, [None, config.test_para_limit],"context") 16 | self.q = tf.placeholder(tf.int64, [None, config.test_ques_limit],"question") 17 | self.ch = tf.placeholder(tf.int64, [None, config.test_para_limit, config.char_limit],"context_char") 18 | self.qh = tf.placeholder(tf.int64, [None, config.test_ques_limit, config.char_limit],"question_char") 19 | self.ans = tf.placeholder(tf.int64, [None, config.test_para_limit],"answer_index") 20 | else: 21 | if trainable or wrong: 22 | self.c, self.ch, self.q, self.qh, self.ans,self.qa_id = batch.get_next() 23 | else: 24 | self.c, self.ch, self.q, self.qh,self.id= batch.get_next() 25 | 26 | 27 | # self.word_unk = tf.get_variable("word_unk", shape = [config.glove_dim], initializer=initializer()) 28 | self.word_mat = tf.get_variable("word_mat", initializer=tf.constant( 29 | word_mat, dtype=tf.float32),trainable=False) 30 | # self.word_mat = tf.get_variable("word_mat",[len(word_mat),300],initializer=tf.constant_initializer( 31 | # word_mat, dtype=tf.float32), trainable=False) 32 | # self.char_mat = tf.get_variable( 33 | # "char_mat", initializer=tf.constant(char_mat, dtype=tf.float32)) 34 | self.char_mat = self.word_mat 35 | self.c_mask = tf.cast(self.c, tf.bool) 36 | self.q_mask = tf.cast(self.q, tf.bool) 37 | self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1) 38 | self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 39 | if opt: 40 | N, CL = config.batch_size if not self.demo else 1, config.char_limit 41 | self.c_maxlen = tf.reduce_max(self.c_len) 42 | self.q_maxlen = tf.reduce_max(self.q_len) 43 | 44 | self.c = tf.slice(self.c, [0, 0], [N, self.c_maxlen]) 45 | self.q = tf.slice(self.q, [0, 0], [N, self.q_maxlen]) 46 | self.c_mask = tf.slice(self.c_mask, [0, 0], [N, self.c_maxlen]) 47 | self.q_mask = tf.slice(self.q_mask, [0, 0], [N, self.q_maxlen]) 48 | 
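                # Note (added comment): with opt=True the padded batch tensors are trimmed
                # here to the longest context/question actually present in the batch, which
                # keeps the residual blocks cheaper; with opt=False (the else branch below)
                # the fixed config.para_limit / config.ques_limit lengths are used instead.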
self.ch = tf.slice(self.ch, [0, 0, 0], [N, self.c_maxlen, CL]) 49 | self.qh = tf.slice(self.qh, [0, 0, 0], [N, self.q_maxlen, CL]) 50 | # self.ans = tf.slice(self.ans, [0, 0], [N, self.c_maxlen]) 51 | 52 | else: 53 | self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit 54 | 55 | self.ch_len = tf.reshape(tf.reduce_sum( 56 | tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2), [-1]) 57 | self.qh_len = tf.reshape(tf.reduce_sum( 58 | tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2), [-1]) 59 | 60 | self.forward(trainable) 61 | total_params() 62 | 63 | if trainable: 64 | self.lr = tf.minimum(config.learning_rate, 0.001 / tf.log(999.) * tf.log(tf.cast(self.global_step, tf.float32) + 1)) 65 | # self.lr = tf.train.cosine_decay_restarts(config.learning_rate, self.global_step, config.num_steps) 66 | # self.lr = tf.maximum(self.lr, config.end_learning_rate) 67 | self.opt = tf.train.AdamOptimizer(learning_rate = self.lr, beta1 = 0.8, beta2 = 0.999, epsilon = 1e-7) 68 | grads = self.opt.compute_gradients(self.loss) 69 | gradients, variables = zip(*grads) 70 | capped_grads, _ = tf.clip_by_global_norm( 71 | gradients, config.grad_clip) 72 | self.train_op = self.opt.apply_gradients( 73 | zip(capped_grads, variables), global_step=self.global_step) 74 | 75 | def forward(self,trainable): 76 | config = self.config 77 | N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads 78 | if not trainable: 79 | N = 1 80 | 81 | with tf.variable_scope("Input_Embedding_Layer"): 82 | ch_emb = tf.reshape(tf.nn.embedding_lookup( 83 | self.char_mat, self.ch), [N * PL, CL, dc]) 84 | qh_emb = tf.reshape(tf.nn.embedding_lookup( 85 | self.char_mat, self.qh), [N * QL, CL, dc]) 86 | ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout) 87 | qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout) 88 | 89 | # Bidaf style conv-highway encoder 90 | ch_emb = conv(ch_emb, d, 91 | bias = True, activation = tf.nn.relu, kernel_size = 2, name = "char_conv", reuse = None) 92 | qh_emb = conv(qh_emb, d, 93 | bias = True, activation = tf.nn.relu, kernel_size = 2, name = "char_conv", reuse = True) 94 | 95 | ch_emb = tf.reduce_max(ch_emb, axis = 1) 96 | qh_emb = tf.reduce_max(qh_emb, axis = 1) 97 | 98 | ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]]) 99 | qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]]) 100 | 101 | c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout) 102 | q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout) 103 | 104 | c_emb = tf.concat([c_emb, ch_emb], axis=2) 105 | q_emb = tf.concat([q_emb, qh_emb], axis=2) 106 | 107 | c_emb = highway(c_emb, size = d, scope = "highway", dropout = self.dropout, reuse = None) 108 | q_emb = highway(q_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True) 109 | 110 | with tf.variable_scope("Embedding_Encoder_Layer"): 111 | c = residual_block(c_emb, 112 | num_blocks = 1, 113 | num_conv_layers = 4, 114 | kernel_size = 7, 115 | mask = self.c_mask, 116 | num_filters = d, 117 | num_heads = nh, 118 | seq_len = self.c_len, 119 | scope = "Encoder_Residual_Block", 120 | bias = False, 121 | dropout = self.dropout) 122 | q = residual_block(q_emb, 123 | num_blocks = 1, 124 | num_conv_layers = 4, 125 | kernel_size = 7, 126 | mask = self.q_mask, 127 | num_filters = d, 128 | num_heads = nh, 129 | seq_len = self.q_len, 130 | scope = "Encoder_Residual_Block", 131 | reuse = True, 
# Share the weights between passage and question 132 | bias = False, 133 | dropout = self.dropout) 134 | 135 | with tf.variable_scope("Context_to_Query_Attention_Layer"): 136 | # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1]) 137 | # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1]) 138 | # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout) 139 | S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen, input_keep_prob = 1.0 - self.dropout) 140 | mask_q = tf.expand_dims(self.q_mask, 1) 141 | S_ = tf.nn.softmax(mask_logits(S, mask = mask_q)) 142 | mask_c = tf.expand_dims(self.c_mask, 2) 143 | S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), dim = 1),(0,2,1)) 144 | self.c2q = tf.matmul(S_, q) 145 | self.q2c = tf.matmul(tf.matmul(S_, S_T), c) 146 | attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c] 147 | 148 | with tf.variable_scope("Model_Encoder_Layer"): 149 | inputs = tf.concat(attention_outputs, axis = -1) 150 | self.enc = [conv(inputs, d, name = "input_projection")] 151 | for i in range(2): 152 | if i % 2 == 0: # dropout every 2 blocks 153 | self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout) 154 | self.enc.append( 155 | residual_block(self.enc[i], 156 | num_blocks = 7, 157 | num_conv_layers = 2, 158 | kernel_size = 5, 159 | mask = self.c_mask, 160 | num_filters = d, 161 | num_heads = nh, 162 | seq_len = self.c_len, 163 | scope = "Model_Encoder", 164 | bias = False, 165 | reuse = True if i > 0 else None, 166 | dropout = self.dropout) 167 | ) 168 | 169 | with tf.variable_scope("Output_Layer"): 170 | concat = tf.concat([self.enc[1], self.enc[2]],axis = -1) 171 | # concat = conv(concat, 1, bias=False, name="start_pointer") 172 | # concat = tf.squeeze(concat) 173 | after_conv = tf.layers.conv1d( 174 | inputs=concat, 175 | filters=32, 176 | kernel_size=5, 177 | padding="same", 178 | activation=tf.nn.relu 179 | ) 180 | after_pool = tf.layers.max_pooling1d(inputs=after_conv,pool_size=4,strides=1) 181 | pool_flat = tf.reshape(after_pool, [-1, 97 * 32]) 182 | 183 | dense = tf.layers.dense(inputs=pool_flat,units=3,use_bias=False) 184 | # after_dropout = tf.layers.dropout(inputs=dense, rate=1.0 - self.dropout) 185 | # dense2 = tf.layers.dense(inputs=after_dropout,units=3,activation=tf.nn.relu) 186 | self.logits = dense 187 | 188 | # ans_logist = conv(tf.concat([self.enc[1], self.enc[2]],axis = -1),1, bias = False, name = "ans") 189 | # ans_logist = tf.squeeze(ans_logist, -1) 190 | # ans_logist = tf.nn.softmax(ans_logist) 191 | # ans_logist3 = tf.layers.dense(ans_logist,3,activation=tf.nn.relu) 192 | # self.logits = ans_logist3 193 | 194 | outer = tf.nn.softmax(self.logits) 195 | self.ansp = tf.argmax(outer, axis=1) 196 | 197 | if trainable: 198 | self.outer = outer 199 | 200 | # losses = tf.nn.softmax_cross_entropy_with_logits( 201 | # logits=dense2, labels=self.ans) 202 | # self.loss = tf.reduce_mean(losses) 203 | losses = tf.losses.softmax_cross_entropy(self.ans, self.logits) 204 | self.loss = losses 205 | 206 | self.acc = tf.reduce_mean(tf.cast(tf.equal(self.ansp, tf.argmax(self.ans, 1)), tf.float32)) 207 | 208 | if config.l2_norm is not None: 209 | variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) 210 | l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables) 211 | self.loss += l2_loss 212 | 213 | if config.decay is not None: 214 | self.var_ema = tf.train.ExponentialMovingAverage(config.decay) 215 | ema_op = self.var_ema.apply(tf.trainable_variables()) 216 | with 
tf.control_dependencies([ema_op]): 217 | self.loss = tf.identity(self.loss) 218 | 219 | self.assign_vars = [] 220 | for var in tf.global_variables(): 221 | v = self.var_ema.average(var) 222 | if v: 223 | self.assign_vars.append(tf.assign(var,v)) 224 | 225 | def get_loss(self): 226 | return self.loss 227 | 228 | def get_global_step(self): 229 | return self.global_step 230 | -------------------------------------------------------------------------------- /QANet/model_cla_ver646.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from QANet.layers import initializer, regularizer, residual_block, highway, conv, mask_logits, trilinear, total_params, optimized_trilinear_for_attention 3 | 4 | class Model(object): 5 | def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True,wrong=False, opt=False, demo = False, graph = None): 6 | self.config = config 7 | self.demo = demo 8 | self.graph = graph if graph is not None else tf.Graph() 9 | with self.graph.as_default(): 10 | 11 | self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32, 12 | initializer=tf.constant_initializer(0), trainable=False) 13 | self.dropout = tf.placeholder_with_default(0.0, (), name="dropout") 14 | if self.demo: 15 | self.c = tf.placeholder(tf.int64, [None, config.test_para_limit],"context") 16 | self.q = tf.placeholder(tf.int64, [None, config.test_ques_limit],"question") 17 | self.ch = tf.placeholder(tf.int64, [None, config.test_para_limit, config.char_limit],"context_char") 18 | self.qh = tf.placeholder(tf.int64, [None, config.test_ques_limit, config.char_limit],"question_char") 19 | self.ans = tf.placeholder(tf.int64, [None, config.test_para_limit],"answer_index") 20 | else: 21 | if trainable or wrong: 22 | self.c, self.ch, self.q, self.qh, self.ans,self.qa_id = batch.get_next() 23 | else: 24 | self.c, self.ch, self.q, self.qh,self.id= batch.get_next() 25 | 26 | 27 | # self.word_unk = tf.get_variable("word_unk", shape = [config.glove_dim], initializer=initializer()) 28 | self.word_mat = tf.get_variable("word_mat", initializer=tf.constant( 29 | word_mat, dtype=tf.float32),trainable=False) 30 | # self.word_mat = tf.get_variable("word_mat",[len(word_mat),300],initializer=tf.constant_initializer( 31 | # word_mat, dtype=tf.float32), trainable=False) 32 | # self.char_mat = tf.get_variable( 33 | # "char_mat", initializer=tf.constant(char_mat, dtype=tf.float32)) 34 | self.char_mat = self.word_mat 35 | self.c_mask = tf.cast(self.c, tf.bool) 36 | self.q_mask = tf.cast(self.q, tf.bool) 37 | self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1) 38 | self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 39 | if opt: 40 | N, CL = config.batch_size if not self.demo else 1, config.char_limit 41 | self.c_maxlen = tf.reduce_max(self.c_len) 42 | self.q_maxlen = tf.reduce_max(self.q_len) 43 | 44 | self.c = tf.slice(self.c, [0, 0], [N, self.c_maxlen]) 45 | self.q = tf.slice(self.q, [0, 0], [N, self.q_maxlen]) 46 | self.c_mask = tf.slice(self.c_mask, [0, 0], [N, self.c_maxlen]) 47 | self.q_mask = tf.slice(self.q_mask, [0, 0], [N, self.q_maxlen]) 48 | self.ch = tf.slice(self.ch, [0, 0, 0], [N, self.c_maxlen, CL]) 49 | self.qh = tf.slice(self.qh, [0, 0, 0], [N, self.q_maxlen, CL]) 50 | # self.ans = tf.slice(self.ans, [0, 0], [N, self.c_maxlen]) 51 | 52 | else: 53 | self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit 54 | 55 | self.ch_len = tf.reshape(tf.reduce_sum( 56 | tf.cast(tf.cast(self.ch, tf.bool), 
tf.int32), axis=2), [-1]) 57 | self.qh_len = tf.reshape(tf.reduce_sum( 58 | tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2), [-1]) 59 | 60 | self.forward(trainable) 61 | total_params() 62 | 63 | if trainable: 64 | self.lr = tf.minimum(config.learning_rate, 0.001 / tf.log(999.) * tf.log(tf.cast(self.global_step, tf.float32) + 1)) 65 | # self.lr = tf.train.cosine_decay_restarts(config.learning_rate, self.global_step, config.num_steps) 66 | # self.lr = tf.maximum(self.lr, config.end_learning_rate) 67 | self.opt = tf.train.AdamOptimizer(learning_rate = self.lr, beta1 = 0.8, beta2 = 0.999, epsilon = 1e-7) 68 | grads = self.opt.compute_gradients(self.loss) 69 | gradients, variables = zip(*grads) 70 | capped_grads, _ = tf.clip_by_global_norm( 71 | gradients, config.grad_clip) 72 | self.train_op = self.opt.apply_gradients( 73 | zip(capped_grads, variables), global_step=self.global_step) 74 | 75 | def forward(self,trainable): 76 | config = self.config 77 | N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads 78 | if not trainable: 79 | N = 1 80 | 81 | with tf.variable_scope("Input_Embedding_Layer"): 82 | ch_emb = tf.reshape(tf.nn.embedding_lookup( 83 | self.char_mat, self.ch), [N * PL, CL, dc]) 84 | qh_emb = tf.reshape(tf.nn.embedding_lookup( 85 | self.char_mat, self.qh), [N * QL, CL, dc]) 86 | ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout) 87 | qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout) 88 | 89 | # Bidaf style conv-highway encoder 90 | ch_emb = conv(ch_emb, d, 91 | bias = True, activation = tf.nn.relu, kernel_size = 1, name = "char_conv", reuse = None) 92 | qh_emb = conv(qh_emb, d, 93 | bias = True, activation = tf.nn.relu, kernel_size = 1, name = "char_conv", reuse = True) 94 | 95 | ch_emb = tf.reduce_max(ch_emb, axis = 1) 96 | qh_emb = tf.reduce_max(qh_emb, axis = 1) 97 | 98 | ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]]) 99 | qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]]) 100 | 101 | c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout) 102 | q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout) 103 | 104 | c_emb = tf.concat([c_emb, ch_emb], axis=2) 105 | q_emb = tf.concat([q_emb, qh_emb], axis=2) 106 | 107 | c_emb = highway(c_emb, size = d, scope = "highway", dropout = self.dropout, reuse = None) 108 | q_emb = highway(q_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True) 109 | 110 | with tf.variable_scope("Embedding_Encoder_Layer"): 111 | c = residual_block(c_emb, 112 | num_blocks = 1, 113 | num_conv_layers = 4, 114 | kernel_size = 4, 115 | mask = self.c_mask, 116 | num_filters = d, 117 | num_heads = nh, 118 | seq_len = self.c_len, 119 | scope = "Encoder_Residual_Block", 120 | bias = False, 121 | dropout = self.dropout) 122 | q = residual_block(q_emb, 123 | num_blocks = 1, 124 | num_conv_layers = 4, 125 | kernel_size = 4, 126 | mask = self.q_mask, 127 | num_filters = d, 128 | num_heads = nh, 129 | seq_len = self.q_len, 130 | scope = "Encoder_Residual_Block", 131 | reuse = True, # Share the weights between passage and question 132 | bias = False, 133 | dropout = self.dropout) 134 | 135 | with tf.variable_scope("Context_to_Query_Attention_Layer"): 136 | # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1]) 137 | # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1]) 138 | # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout) 139 | 
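            # Note (added comment): this is the BiDAF-style attention. S is an
            # [N, c_maxlen, q_maxlen] similarity matrix; softmax over the question axis
            # gives context-to-query attention (c2q), softmax over the context axis
            # (then transposed) gives query-to-context attention (q2c), and the layer
            # finally passes [c, c2q, c * c2q, c * q2c] on to the model encoder.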
S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen, input_keep_prob = 1.0 - self.dropout) 140 | mask_q = tf.expand_dims(self.q_mask, 1) 141 | S_ = tf.nn.softmax(mask_logits(S, mask = mask_q)) 142 | mask_c = tf.expand_dims(self.c_mask, 2) 143 | S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), dim = 1),(0,2,1)) 144 | self.c2q = tf.matmul(S_, q) 145 | self.q2c = tf.matmul(tf.matmul(S_, S_T), c) 146 | attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c] 147 | 148 | with tf.variable_scope("Model_Encoder_Layer"): 149 | inputs = tf.concat(attention_outputs, axis = -1) 150 | self.enc = [conv(inputs, d, name = "input_projection")] 151 | for i in range(2): 152 | if i % 2 == 0: # dropout every 2 blocks 153 | self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout) 154 | self.enc.append( 155 | residual_block(self.enc[i], 156 | num_blocks = 7, 157 | num_conv_layers = 2, 158 | kernel_size = 5, 159 | mask = self.c_mask, 160 | num_filters = d, 161 | num_heads = nh, 162 | seq_len = self.c_len, 163 | scope = "Model_Encoder", 164 | bias = False, 165 | reuse = True if i > 0 else None, 166 | dropout = self.dropout) 167 | ) 168 | 169 | with tf.variable_scope("Output_Layer"): 170 | concat = tf.concat([self.enc[1], self.enc[2]],axis = -1) 171 | # concat = conv(concat, 1, bias=False, name="start_pointer") 172 | # concat = tf.squeeze(concat) 173 | after_conv = tf.layers.conv1d( 174 | inputs=concat, 175 | filters=32, 176 | kernel_size=5, 177 | padding="same", 178 | activation=tf.nn.relu 179 | ) 180 | after_pool = tf.layers.max_pooling1d(inputs=after_conv,pool_size=2,strides=1) 181 | pool_flat = tf.reshape(after_pool, [-1, 99 * 32]) 182 | 183 | dense = tf.layers.dense(inputs=pool_flat,units=3,use_bias=False) 184 | # after_dropout = tf.layers.dropout(inputs=dense, rate=1.0 - self.dropout) 185 | # dense2 = tf.layers.dense(inputs=after_dropout,units=3,activation=tf.nn.relu) 186 | self.logits = dense 187 | 188 | # ans_logist = conv(tf.concat([self.enc[1], self.enc[2]],axis = -1),1, bias = False, name = "ans") 189 | # ans_logist = tf.squeeze(ans_logist, -1) 190 | # ans_logist = tf.nn.softmax(ans_logist) 191 | # ans_logist3 = tf.layers.dense(ans_logist,3,activation=tf.nn.relu) 192 | # self.logits = ans_logist3 193 | 194 | outer = tf.nn.softmax(self.logits) 195 | self.ansp = tf.argmax(outer, axis=1) 196 | 197 | if trainable: 198 | self.outer = outer 199 | 200 | # losses = tf.nn.softmax_cross_entropy_with_logits( 201 | # logits=dense2, labels=self.ans) 202 | # self.loss = tf.reduce_mean(losses) 203 | losses = tf.losses.softmax_cross_entropy(self.ans, self.logits) 204 | self.loss = losses 205 | 206 | self.acc = tf.reduce_mean(tf.cast(tf.equal(self.ansp, tf.argmax(self.ans, 1)), tf.float32)) 207 | 208 | if config.l2_norm is not None: 209 | variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) 210 | l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables) 211 | self.loss += l2_loss 212 | 213 | if config.decay is not None: 214 | self.var_ema = tf.train.ExponentialMovingAverage(config.decay) 215 | ema_op = self.var_ema.apply(tf.trainable_variables()) 216 | with tf.control_dependencies([ema_op]): 217 | self.loss = tf.identity(self.loss) 218 | 219 | self.assign_vars = [] 220 | for var in tf.global_variables(): 221 | v = self.var_ema.average(var) 222 | if v: 223 | self.assign_vars.append(tf.assign(var,v)) 224 | 225 | def get_loss(self): 226 | return self.loss 227 | 228 | def get_global_step(self): 229 | return self.global_step 230 | 
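# Illustrative sketch (not part of the original file): where the 97*32 and 99*32
# reshapes in the two classification output layers come from, assuming the context
# is padded to 100 tokens as those reshapes imply. tf.layers.conv1d with
# padding="same" keeps the time axis unchanged, and stride-1 max pooling with the
# default "valid" padding shrinks it to para_limit - pool_size + 1. The helper name
# below is ours, used only for this check.

def pooled_flat_size(para_limit, pool_size, filters=32):
    """Length of the flattened vector fed to the final 3-way dense layer."""
    out_len = para_limit - pool_size + 1  # stride-1 VALID max pooling
    return out_len * filters

assert pooled_flat_size(100, 4) == 97 * 32  # model_cla_v60 output layer (pool_size=4)
assert pooled_flat_size(100, 2) == 99 * 32  # model_cla_ver646 output layer (pool_size=2)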
-------------------------------------------------------------------------------- /QANet/models/ver20/event/log: -------------------------------------------------------------------------------- 1 | tensorboard tmp file -------------------------------------------------------------------------------- /QANet/models/ver20/model/log: -------------------------------------------------------------------------------- 1 | your model -------------------------------------------------------------------------------- /QANet/models/ver60/event/log: -------------------------------------------------------------------------------- 1 | tensorboard file -------------------------------------------------------------------------------- /QANet/models/ver60/model/log: -------------------------------------------------------------------------------- 1 | model here -------------------------------------------------------------------------------- /QANet/models/ver646/event/log: -------------------------------------------------------------------------------- 1 | tensorboard file -------------------------------------------------------------------------------- /QANet/models/ver646/model/log: -------------------------------------------------------------------------------- 1 | model file -------------------------------------------------------------------------------- /QANet/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pickle as cPickle 3 | import json 4 | import numpy as np 5 | import jieba 6 | import thulac 7 | from tqdm import tqdm 8 | 9 | pythonVersion = "python3" 10 | cut = thulac.thulac(seg_only=True) 11 | 12 | def seg_line(line): 13 | # a =cut.cut(line, text=True) 14 | # b = a.split(" ") 15 | return list(jieba.cut(line)) 16 | # return b 17 | 18 | def seg_data(path): 19 | print ('start process ', path) 20 | data = [] 21 | if pythonVersion == "python3": 22 | with open(path, 'r',encoding='utf-8') as f: 23 | for line in tqdm(f): 24 | dic = json.loads(line) 25 | question = dic['query'] 26 | doc = dic['passage'] 27 | alternatives = dic['alternatives'] 28 | data.append([seg_line(question), seg_line(doc), alternatives.split('|'), dic['query_id']]) 29 | else: 30 | with open(path, 'r') as f: 31 | for line in f: 32 | dic = json.loads(line, encoding='utf-8') 33 | question = dic['query'] 34 | doc = dic['passage'] 35 | alternatives = dic['alternatives'] 36 | data.append([seg_line(question), seg_line(doc), alternatives.split('|'), dic['query_id']]) 37 | return data 38 | 39 | 40 | def build_word_count(data): 41 | wordCount = {} 42 | 43 | def add_count(lst): 44 | for word in lst: 45 | if word not in wordCount: 46 | wordCount[word] = 0 47 | wordCount[word] += 1 48 | 49 | for one in data: 50 | [add_count(x) for x in one[0:3]] 51 | print ('word type size ', len(wordCount)) 52 | return wordCount 53 | 54 | 55 | def build_word2id(wordCount, threshold=10): 56 | word2id = {'': 0, '': 1} 57 | for word in wordCount: 58 | if wordCount[word] >= threshold: 59 | if word not in word2id: 60 | word2id[word] = len(word2id) 61 | else: 62 | chars = list(word) 63 | for char in chars: 64 | if char not in word2id: 65 | word2id[char] = len(word2id) 66 | print ('processed word size ', len(word2id)) 67 | return word2id 68 | 69 | 70 | 71 | 72 | 73 | 74 | def get_answer_label(seg_query,ansTokens): 75 | shuffledAnsToken = [] 76 | query = "" 77 | for i in seg_query: 78 | query+=i 79 | label = None 80 | ansTokens = [x.strip() for x in ansTokens] 81 | unkownMark = False 82 | 
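    # Note (added comment): the heuristic below first locates the "cannot be determined"
    # option using the unkownChar list; among the remaining two options, the one found
    # earliest in the query is taken as the positive answer. If that fails, a dictionary
    # of negation phrases (and finally single characters such as 不/没/否/错) decides which
    # option is the negative one, and the label plus shuffled answer order are derived
    # from those indices.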
unkownIdx = -1 83 | unkownChar = ['无法确定', '无法确认', '不确定', '不能确定', 'wfqd', '无法选择', '无法确实', '无法取代', '取法确定', '无法确', '无法㾡', '无法去顶', '无确定', 84 | '无法去顶', '我放弃', '无法缺定', '无法无额定', '无法判断', '不清楚', '无人确定',"不知道"] 85 | 86 | for idx, token in enumerate(ansTokens): 87 | for ch in unkownChar: 88 | if token.find(ch) != -1: 89 | unkownMark = True 90 | unkownIdx = idx 91 | break 92 | if unkownMark: 93 | break 94 | # print("%s %s %s : %d %s"%(ansTokens[0],ansTokens[1],ansTokens[2],unkownIdx,ansTokens[unkownIdx])) 95 | minFindStart = 999999 96 | minIdx = -1 97 | if unkownMark == False: 98 | pass 99 | # print("%s %s %s unkonwn mark error" % (ansTokens[0], ansTokens[1], ansTokens[2])) 100 | else: 101 | for idx, token in enumerate(ansTokens): 102 | if unkownIdx == idx: 103 | continue 104 | tmpFindStart = query.find(token) 105 | if tmpFindStart == -1: 106 | tmpFindStart = 999999 107 | 108 | if minFindStart > tmpFindStart: 109 | minIdx = idx 110 | minFindStart = tmpFindStart 111 | if not (minIdx < 0 or minIdx > 2 or unkownMark < 0 or unkownMark > 2): 112 | if minIdx == 0: 113 | label = [1,0,0] 114 | elif unkownIdx == 0 : 115 | label = [0,0,1] 116 | else : label = [0,1,0] 117 | else: 118 | minIdx = -999 119 | pessimisticDic = {"不会", "不可以", "不是", "假的", "不要", "不靠谱", "不能", "没有", "不需要", "没出", "不给", "不用", "不可能", "不好", "不同意", 120 | "不对", "不算", "不行", "不快", "不能", "没用", "不合适", "不正常", "不好", "不可", "不正确", "不高", "不难", "不属于", "不合适", 121 | "不值钱", "不友好", "不幸运", "不应该", "不值"} 122 | for idx, token in enumerate(ansTokens): 123 | if idx == unkownIdx: 124 | continue 125 | for opt in pessimisticDic: 126 | if token.find(opt) != -1: 127 | minIdx = 3 - idx - unkownIdx 128 | if minIdx != -999: 129 | if minIdx == 0: 130 | label = [1, 0, 0] 131 | elif unkownIdx == 0: 132 | label = [0, 0, 1] 133 | else: 134 | label = [0, 1, 0] 135 | else: 136 | minIdx = -999 137 | for idx, token in enumerate(ansTokens): 138 | if token.find("不确定") == -1 and token.find("不能确定") == -1 and ( 139 | token.find("不") != -1 or token.find("否") != -1 or token.find( 140 | "没") != -1 or token.find("错") != -1): 141 | minIdx = 3 - idx - unkownIdx 142 | if minIdx != -999: 143 | if minIdx == 0: 144 | label = [1, 0, 0] 145 | elif unkownIdx == 0: 146 | label = [0, 0, 1] 147 | else: 148 | label = [0, 1, 0] 149 | else: 150 | print("after last process ,still failed") 151 | try: 152 | if label != None: 153 | if minIdx == 0: 154 | if unkownIdx == 1: 155 | shuffledAnsToken = [ansTokens[0],ansTokens[2],ansTokens[1]] 156 | elif unkownIdx ==2: 157 | shuffledAnsToken = [ansTokens[0], ansTokens[1], ansTokens[2]] 158 | elif minIdx == 1: 159 | if unkownIdx == 0: 160 | shuffledAnsToken = [ansTokens[1], ansTokens[2], ansTokens[0]] 161 | elif unkownIdx == 2: 162 | shuffledAnsToken = [ansTokens[1], ansTokens[0], ansTokens[2]] 163 | elif minIdx == 2: 164 | if unkownIdx == 0: 165 | shuffledAnsToken = [ansTokens[2], ansTokens[1], ansTokens[0]] 166 | elif unkownIdx == 1: 167 | shuffledAnsToken = [ansTokens[2], ansTokens[0], ansTokens[1]] 168 | except: 169 | shuffledAnsToken = [] 170 | 171 | return label,ansTokens,shuffledAnsToken 172 | 173 | 174 | 175 | 176 | def transform_data_to_id(raw_data, word2id,fileOut): 177 | data = [] 178 | 179 | def map_word_to_id(word): 180 | output = [] 181 | if word in word2id: 182 | output.append(word2id[word]) 183 | else: 184 | chars = list(word) 185 | for char in chars: 186 | if char in word2id: 187 | output.append(word2id[char]) 188 | else: 189 | output.append(1) 190 | return output 191 | 192 | def map_sent_to_id(sent): 193 | output = [] 194 | for word in sent: 195 | 
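                # Note (added comment): map_word_to_id falls back to per-character ids for
                # out-of-vocabulary words, and to id 1 for characters that are still unseen,
                # so no token is silently dropped.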
output.extend(map_word_to_id(word)) 196 | return output 197 | 198 | 199 | print("disposing...") 200 | for idx,one in enumerate(raw_data): 201 | question,doc,label= [],[],[] 202 | if word2id is not None: 203 | question = map_sent_to_id(one[0]) 204 | doc = map_sent_to_id(one[1]) 205 | 206 | label,ansTokens,shuffleRes = get_answer_label(one[0],one[2]) 207 | if label == None : 208 | data.append([question, doc, one[0],one[1],label, one[2], one[2], one[-1], idx,-1]) 209 | # fileOut.write("query id:{}| successful mark:{} | label:{} | orignal answer:{} | shuffleRes:{}| question:{} | doc:{}\n".format(one[-1],-1,label, one[2], one[2],one[0], one[1])) 210 | else: 211 | data.append([question,doc,one[0],one[1],ansTokens,label,shuffleRes,one[-1],idx,1]) 212 | try: 213 | if shuffleRes == []: 214 | pass 215 | # fileOut.write( 216 | # "===shuffle error===:query id:{}| successful mark:{} | label:{} | orignal answer:{} | shuffleRes:{}| question:{} | doc:{} | question idx:{},doc idx{}\n".format( 217 | # one[-1], 1,label, ansTokens, shuffleRes, one[0], one[1],question,doc)) 218 | else: 219 | pass 220 | # fileOut.write("query id:{}| successful mark:{} | label:{} | orignal answer:{} | shuffleRes:{}| question:{} | doc:{} | question idx:{},doc idx{}\n".format(one[-1],1,label,ansTokens,shuffleRes,one[0],one[1],question,doc)) 221 | except : 222 | pass 223 | # fileOut.write("===print error==={}\n".format(one[-1])) 224 | #print(label) 225 | print("data size : "+ str(len(data))) 226 | return data 227 | 228 | 229 | def load_pretrained_embeddings_word(embedding_path): 230 | """ 231 | loads the pretrained embeddings from embedding_path, 232 | tokens not in pretrained embeddings will be filtered 233 | Args: 234 | embedding_path: the path of the pretrained embedding file 235 | """ 236 | wordList = [] 237 | trainedEmbeddings = {} 238 | 239 | if pythonVersion == "python3": 240 | with open(embedding_path, 'r',encoding='utf-8') as file: 241 | while 1: 242 | read_line = file.readline() 243 | if not read_line: break 244 | line = read_line 245 | contents = line.strip().split() 246 | token = contents[0] 247 | wordList.append(token) 248 | trainedEmbeddings[token] = list(map(float, contents[1:])) 249 | else: 250 | with open(embedding_path, 'r') as file: 251 | while 1: 252 | read_line = file.readline() 253 | if not read_line: break 254 | line = read_line.decode("utf-8") 255 | contents = line.strip().split() 256 | token = contents[0] 257 | wordList.append(token) 258 | trainedEmbeddings[token] = list(map(float, contents[1:])) 259 | return wordList, trainedEmbeddings 260 | 261 | 262 | def build_word2id_embedding_from_pretrained_embedding(trainedEmbeddings, embed_dim): 263 | word2id = {'': 0, '': 1} 264 | for i,word in enumerate(trainedEmbeddings): 265 | word2id[word] = len(word2id) 266 | 267 | embeddings = np.zeros([len(word2id), embed_dim]) 268 | count = 0 269 | for i,token in enumerate(word2id): 270 | if token in trainedEmbeddings: 271 | # print("==========="+str(i)) 272 | try: 273 | embeddings[word2id[token]] = trainedEmbeddings[token] 274 | except Exception as e: 275 | count+=1 276 | print(e) 277 | print ("trainedEmbeddings[token]:"+token+":"+str(len(trainedEmbeddings[token]))) 278 | print (trainedEmbeddings[token]) 279 | print ("aborded embedding number:"+str(count)) 280 | return word2id, embeddings 281 | 282 | 283 | def process_data(data_path,in_path,out_path,testa_path, threshold , embed_dim , pretrained_embedding_path ,out_embedding_path , out_word2id_path): 284 | # train_file_path = data_path + 
'ai_challenger_oqmrc_trainingset_20180816/ai_challenger_oqmrc_trainingset.json' 285 | train_file_path = in_path 286 | dev_file_path = out_path 287 | # test_a_file_path = data_path + testa_path 288 | # path_lst = [train_file_path, dev_file_path, test_a_file_path] 289 | path_lst = [train_file_path] 290 | # path_lst = ["./data/debug.json"] 291 | # output_path = [data_path + x for x in ['train2_compare.pickle', 'dev2xx.pickle', 'testa2xx.pickle']] 292 | output_path = [out_path] 293 | #output_path = ["./data/debug.pick"] 294 | return _process_data(path_lst, output_path ,embed_dim ,out_embedding_path,out_word2id_path, 295 | pretrained_embedding_path,loading_pretrained_embedding=-1,word_min_count=threshold) 296 | 297 | 298 | 299 | ###input file list , threshold , output file list 300 | def _process_data(path_lst,output_file_path,embed_dim,out_embedding_path,out_word2id_path, pretrained_embedding_path = "",loading_pretrained_embedding = 0 ,word_min_count=5): 301 | raw_data = [] 302 | #对原始语料分词 注意答案的分词方式是通过语料中的 | 直接分词的 303 | for path in path_lst: 304 | raw_data.append(seg_data(path)) 305 | word2id = None 306 | print ("seg is OK...") 307 | if loading_pretrained_embedding==1: 308 | wordList,trainedEmbeddings = load_pretrained_embeddings_word(pretrained_embedding_path) 309 | 310 | word2id,embedding = build_word2id_embedding_from_pretrained_embedding(trainedEmbeddings,embed_dim) # 对分词之后的建立所有,小于threshold的词将被分成char,然后加入词表 311 | print("embedding length:" + str(len(embedding)) + "word2id:" + str(len(word2id))) 312 | with open(out_embedding_path, 'wb') as f: 313 | cPickle.dump(embedding, f) 314 | print ("write word2id.obj to data/word2id.obj...") 315 | with open(out_word2id_path, 'wb') as f: 316 | cPickle.dump(word2id, f) 317 | #print("write words.table to data/words.table...") 318 | # with open('data/words.table', 'w') as f: 319 | # s = "" 320 | # for word in trainedEmbeddings: 321 | # s="{0}:{1}\n".format(word.decode('gbk').encode('utf-8'),trainedEmbeddings[word]) 322 | # f.write(s) 323 | 324 | 325 | 326 | elif loading_pretrained_embedding == 0: 327 | word_count = build_word_count( 328 | [y for x in raw_data for y in x]) # [[question,doc,answer,id],[],[]...] 
word_count = {"":} 329 | with open('data/word-count.obj', 'wb') as f: 330 | cPickle.dump(word_count, f) 331 | word2id = build_word2id(word_count, word_min_count) # 对分词之后的建立所有,小于threshold的词将被分成char,然后加入词表 332 | with open('data/word2id.obj', 'wb') as f: 333 | cPickle.dump(word2id, f) 334 | print("dispose data and write...") 335 | for one_raw_data, one_output_file_path in zip(raw_data, output_file_path): 336 | with open(one_output_file_path, 'wb') as f: 337 | with open("."+one_output_file_path.split('.')[1]+"_log.txt", 'w') as logFout: 338 | one_data = transform_data_to_id(one_raw_data, word2id,logFout) 339 | cPickle.dump(one_data, f) 340 | if word2id is not None : 341 | res = len(word2id) 342 | else : 343 | res = -1 344 | return res 345 | -------------------------------------------------------------------------------- /QANet/run_preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import argparse 3 | 4 | from QANet.preprocess import process_data 5 | 6 | 7 | def run_preprocess(in_path,out_path): 8 | 9 | dataRoot = './data/pickles' 10 | data = dataRoot 11 | threshold = 1 12 | embed_dim = 300 13 | pretrained_embedding_path = dataRoot+'jwe_word2vec_size300.txt' 14 | train_path = 'aic18/train2_dot_com.json' 15 | valid_path = 'aic18/dev2_dot_com.json' 16 | testa_path = 'aic18/aic_test.json' 17 | out_embedding_path = dataRoot+'embedding.table' 18 | out_word2id_path = dataRoot+'word2id.table' 19 | 20 | # train_path 是输入路径,valid_path是输出路径 21 | vocab_size = process_data(data, in_path,out_path,testa_path, 22 | threshold,embed_dim,pretrained_embedding_path, 23 | out_embedding_path,out_word2id_path) 24 | 25 | -------------------------------------------------------------------------------- /QANet/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | ''' 4 | This file is taken and modified from R-Net by HKUST-KnowComp 5 | https://github.com/HKUST-KnowComp/R-Net 6 | ''' 7 | def get_record_parser20(): 8 | def parse_example(serial_example): 9 | features = tf.parse_single_example(serial_example,features={ 10 | 'context_tokens_ids':tf.FixedLenFeature([],tf.string), 11 | 'context_chars_ids':tf.FixedLenFeature([],tf.string), 12 | 'ques_tokens_ids':tf.FixedLenFeature([],tf.string), 13 | 'ques_chars_ids':tf.FixedLenFeature([],tf.string), 14 | 'ans':tf.FixedLenFeature([],tf.string), 15 | 'q_id': tf.FixedLenFeature([], tf.string) 16 | }) 17 | context_tokens = tf.reshape(tf.decode_raw(features['context_tokens_ids'],tf.int64),[100]) 18 | context_chars = tf.reshape(tf.decode_raw(features['context_chars_ids'],tf.int64),[100,4]) 19 | ques_tokens = tf.reshape(tf.decode_raw(features['ques_tokens_ids'], tf.int64), [30]) 20 | ques_chars = tf.reshape(tf.decode_raw(features['ques_chars_ids'], tf.int64), [30, 4]) 21 | ans = tf.reshape(tf.decode_raw(features['ans'],tf.int64),[3,2]) 22 | q_id = tf.reshape(tf.decode_raw(features['q_id'], tf.int64), []) 23 | return context_tokens,context_chars,ques_tokens,ques_chars,ans,q_id 24 | return parse_example 25 | 26 | def get_record_parser60(): 27 | def parse_example(serial_example): 28 | features = tf.parse_single_example(serial_example, features={ 29 | 'context_tokens_ids': tf.FixedLenFeature([], tf.string), 30 | 'context_chars_ids': tf.FixedLenFeature([], tf.string), 31 | 'ques_tokens_ids': tf.FixedLenFeature([], tf.string), 32 | 'ques_chars_ids': tf.FixedLenFeature([], tf.string), 33 | 'ans': tf.FixedLenFeature([], tf.string), 34 | 'q_id': 
tf.FixedLenFeature([], tf.string) 35 | }) 36 | context_tokens = tf.reshape(tf.decode_raw(features['context_tokens_ids'], tf.int64), [100]) 37 | context_chars = tf.reshape(tf.decode_raw(features['context_chars_ids'], tf.int64), [100, 4]) 38 | ques_tokens = tf.reshape(tf.decode_raw(features['ques_tokens_ids'], tf.int64), [30]) 39 | ques_chars = tf.reshape(tf.decode_raw(features['ques_chars_ids'], tf.int64), [30, 4]) 40 | ans = tf.reshape(tf.decode_raw(features['ans'], tf.int64), [3]) 41 | q_id = tf.reshape(tf.decode_raw(features['q_id'], tf.int64), []) 42 | return context_tokens, context_chars, ques_tokens, ques_chars, ans,q_id 43 | return parse_example -------------------------------------------------------------------------------- /QANet/vote.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import jieba 4 | from tqdm import tqdm 5 | 6 | def get_answer_label(seg_query, ansTokens): 7 | shuffledAnsToken_index = [] 8 | shuffledAnsToken = [] 9 | query = "" 10 | for i in seg_query: 11 | query += i 12 | label = None 13 | ansTokens = [x.strip() for x in ansTokens] 14 | unkownMark = False 15 | unkownIdx = -1 16 | unkownChar = ['无法确定', '无法确认', '不确定', '不能确定', 'wfqd', '无法选择', '无法确实', '无法取代', '取法确定', '无法确', '无法㾡', '无法去顶', '无确定', 17 | '无法去顶', '我放弃', '无法缺定', '无法无额定', '无法判断', '不清楚', '无人确定', "不知道"] 18 | 19 | for idx, token in enumerate(ansTokens): 20 | for ch in unkownChar: 21 | if token.find(ch) != -1: 22 | unkownMark = True 23 | unkownIdx = idx 24 | break 25 | if unkownMark: 26 | break 27 | minFindStart = 999999 28 | minIdx = -1 29 | if unkownMark == False: 30 | pass 31 | else: 32 | for idx, token in enumerate(ansTokens): 33 | if unkownIdx == idx: 34 | continue 35 | tmpFindStart = query.find(token) 36 | if tmpFindStart == -1: 37 | tmpFindStart = 999999 38 | 39 | if minFindStart > tmpFindStart: 40 | minIdx = idx 41 | minFindStart = tmpFindStart 42 | if not (minIdx < 0 or minIdx > 2 or unkownMark < 0 or unkownMark > 2): 43 | if minIdx == 0: 44 | label = [1, 0, 0] 45 | elif unkownIdx == 0: 46 | label = [0, 0, 1] 47 | else: 48 | label = [0, 1, 0] 49 | else: 50 | minIdx = -999 51 | pessimisticDic = {"不会", "不可以", "不是", "假的", "不要", "不靠谱", "不能", "没有", "不需要", "没出", "不给", "不用", "不可能", "不好", 52 | "不同意", 53 | "不对", "不算", "不行", "不快", "不能", "没用", "不合适", "不正常", "不好", "不可", "不正确", "不高", "不难", "不属于", 54 | "不合适", 55 | "不值钱", "不友好", "不幸运", "不应该", "不值"} 56 | for idx, token in enumerate(ansTokens): 57 | if idx == unkownIdx: 58 | continue 59 | for opt in pessimisticDic: 60 | if token.find(opt) != -1: 61 | minIdx = 3 - idx - unkownIdx 62 | if minIdx != -999: 63 | if minIdx == 0: 64 | label = [1, 0, 0] 65 | elif unkownIdx == 0: 66 | label = [0, 0, 1] 67 | else: 68 | label = [0, 1, 0] 69 | else: 70 | minIdx = -999 71 | for idx, token in enumerate(ansTokens): 72 | if token.find("不确定") == -1 and token.find("不能确定") == -1 and ( 73 | token.find("不") != -1 or token.find("否") != -1 or token.find( 74 | "没") != -1 or token.find("错") != -1): 75 | minIdx = 3 - idx - unkownIdx 76 | if minIdx != -999: 77 | if minIdx == 0: 78 | label = [1, 0, 0] 79 | elif unkownIdx == 0: 80 | label = [0, 0, 1] 81 | else: 82 | label = [0, 1, 0] 83 | else: 84 | print("after last process ,still failed") 85 | try: 86 | if label != None: 87 | if minIdx == 0: 88 | if unkownIdx == 1: 89 | shuffledAnsToken_index = [0, 2, 1] 90 | elif unkownIdx == 2: 91 | shuffledAnsToken_index = [0, 1, 2] 92 | elif minIdx == 1: 93 | if unkownIdx == 0: 94 | shuffledAnsToken_index = [1, 2, 0] 95 | elif unkownIdx == 2: 96 | 
shuffledAnsToken_index = [1, 0, 2] 97 | elif minIdx == 2: 98 | if unkownIdx == 0: 99 | shuffledAnsToken_index = [2, 1, 0] 100 | elif unkownIdx == 1: 101 | shuffledAnsToken_index = [2, 0, 1] 102 | shuffledAnsToken = [ansTokens[i] for i in shuffledAnsToken_index] 103 | except: 104 | shuffledAnsToken_index = [] 105 | 106 | return label, ansTokens, shuffledAnsToken_index, shuffledAnsToken 107 | 108 | 109 | def modify_index_save(templetfile, savefile): 110 | outf = open(savefile, 'w', encoding='utf-8') 111 | inf = open(templetfile, 'r', encoding='utf-8') 112 | for line in tqdm(inf): 113 | line = json.loads(line) 114 | alternatives = line['alternatives'].split('|') 115 | ques_word = list(jieba.cut(line['query'])) 116 | query_id = line['query_id'] 117 | label, ans, index, shu_ans = get_answer_label(ques_word, alternatives) 118 | if len(shu_ans) == 0: 119 | shu_ans = ans 120 | if label is None: 121 | label = [1, 0, 0] 122 | dict = {'query_id': query_id, 'ans_label': label, 'shu_ans': shu_ans, 'index': index, } 123 | print(json.dumps(dict, ensure_ascii=False), file=outf) 124 | 125 | 126 | def modify(filename, templet, savefile): 127 | file_post = open(filename, 'r', encoding='utf-8') 128 | temp = open(templet, 'r', encoding='utf-8') 129 | savefile = open(savefile, 'w', encoding='utf-8') 130 | 131 | temp_data = {} 132 | for line_t in temp: 133 | line_t = json.loads(line_t) 134 | id = line_t['query_id'] 135 | temp_data[id] = {'ans_label': line_t['ans_label'], 'shu_ans': line_t['shu_ans'], 'index': line_t['index']} 136 | 137 | for line in file_post: 138 | line = json.loads(line) 139 | id_f = line['query_id'] 140 | result = temp_data[id_f] 141 | predict = [line['predict'][i] for i in result['index']] 142 | dict = {'query_id': id_f, 'predict': predict, 'ans_label': result['ans_label'], 'shu_ans': result['shu_ans']} 143 | print(json.dumps(dict, ensure_ascii=False), file=savefile) 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | This project is my code for [AiChallenger2018 Opinion Questions Machine Reading Comprehension](https://challenger.ai/competition/oqmrc2018). It mainly contains 2 models, both implemented in TensorFlow. 3 | * Model-1 is based on QANet, but I rewrote some of its details. 4 | * Model-2 is based on capsuleNet and is mainly adapted from [freefuiiismyname's project](https://github.com/freefuiiismyname/capsule-mrc) 5 | 6 | ## Dependencies 7 | * Python 3.6 8 | * TensorFlow 1.9.0 9 | * tqdm 10 | * gensim 11 | 12 | ## Data Sample 13 | { 14 | “query_id”:1, 15 | “query”:“维生c可以长期吃吗”, 16 | “url”: “xxx”, 17 | “passage”: “每天吃的维生素的量没有超过推荐量的话是没有太大问题的。”, 18 | “alternatives”:”可以|不可以|无法确定”, 19 | “answer”:“可以” 20 | } 21 | 22 | ## Performance 23 | I train each model on one GTX1080ti for 30 epochs and report the best performance on the dev set. In the end we ran Model-1 on testa, where the accuracy was 73.2.
24 | 25 | Model | Accuracy 26 | ---|--- 27 | Model-1(ensembled) | 73.66 28 | Model-2(ensembled) | 73.85 29 | 1&2 ensembled | 76.62 30 | 31 | ## Project Structure 32 | 33 | * capsuleNet: Model-2's code 34 | * data: Model-2's data 35 | * QANet: Model-1's code and data 36 | * start.sh: usage example 37 | * vote_ser_new_word.py: voting-ensemble script 38 | 39 | ## Details 40 | You can find more details in /QANet/README.md and /capsuleNet/README.md 41 | 42 | ## Reference 43 | Some code is borrowed from: 44 | 45 | [NLPLearn/QANet](https://github.com/NLPLearn/QANet) 46 | 47 | [freefuiiismyname/capsule-mrc](https://github.com/freefuiiismyname/capsule-mrc) 48 | -------------------------------------------------------------------------------- /capsuleNet/README.md: -------------------------------------------------------------------------------- 1 | ## Model-2: capsule-mrc 2 | This model borrows heavily from freefuiiismyname's open-source code; most of the description below also comes from his project, and I only added a little of my own understanding plus a few modifications. 3 | The model is a modification of the BiDAF framework. The main changes: BiDAF's Attention Flow Layer is replaced with the multiway attention used in the competition baseline (see the paper "Multiway Attention Networks for Modeling Sentence Pairs"), the modeling layer is removed in favor of a capsule network, and finally the encoded alternatives are dot-multiplied with the capsule network's output. 4 | 5 | If you want to learn about capsule networks, I recommend these two articles: 6 | 7 | [CapsNet: the principle of capsule networks](https://blog.csdn.net/godwriter/article/details/79216404) 8 | 9 | [Understand the CapsNet architecture first, then implement it in TensorFlow (probably the most detailed tutorial)](https://zhuanlan.zhihu.com/p/30753326) 10 | 11 | This model includes 2 sub-models used for ensembling, ver81 and ver84: 12 | * ver81: word2vec trained on 250k examples, lr=0.0005, acc=73.85 13 | * ver84: word2vec trained on 250k examples, with a cosine-restart learning-rate schedule, acc=73.74 14 | 15 | ## Model Diagram 16 | ![pic1](https://github.com/antdlx/aic18_rc/blob/master/capsuleNet/model.png) 17 | 18 | ## Model Pipeline 19 | **Step 1: Encode the question** 20 | 21 | A bi-LSTM encodes the query, which serves as background information for the passage and the candidate answers. (Both the passage and the candidates appear with respect to the query, so the query should act as context for both.) 22 | 23 | **Step 2: Build each candidate answer's own meaning** 24 | 25 | A bi-LSTM encodes the three candidate answers, using the state output by step 1 as the LSTM's initial state so that they carry the question's context. After encoding, each candidate answer is treated as a capsule representing one of three different events. 26 | 27 | **Step 3: Build the passage's understanding of the question** 28 | 29 | The passage goes through LSTM (so each passage word carries contextual meaning; the state is initialized as above), match (interaction with the question), fuse (information fusion), and CNN (key-information extraction), producing N feature capsules that represent the information the passage extracts for the question. 30 | 31 | **Step 4: Cluster the passage information around the candidate answers** 32 | 33 | The information extracted from the passage is routed into the candidate-answer capsules. When an answer's encoding is close to the passage information, that information supports it more readily; otherwise the support it receives decreases. After several rounds of dynamic-routing iterations, the length of each final capsule represents how strongly that answer is present. A softmax then yields the probability of each candidate being the answer. 34 | 35 | ## Usage 36 | Download the [word2vec](https://pan.baidu.com/s/1Izg778MiUlcoqNMimWKjNQ) and put it into 'data/w2v' 37 | 38 | Just run run**.py. Note that you need to specify two parameters: 39 | * --mode: test/dev/train/prepro. Run prepro first to preprocess the data, then choose test/dev/train as needed. 40 | * --input: a path; test/dev/train mode takes the corresponding data file path, while prepro mode takes the test file path. 41 | -------------------------------------------------------------------------------- /capsuleNet/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | 3 | ''' 4 | This module implements data process strategies.
5 | ''' 6 | import json 7 | import logging 8 | import numpy as np 9 | import random 10 | 11 | 12 | class BRCDataset(object): 13 | ''' 14 | 该模块实现加载和使用数据集的api 15 | ''' 16 | 17 | def __init__(self, max_p_len, max_q_len, 18 | train_files=[], dev_files=[], test_files=[]): 19 | self.logger = logging.getLogger('brc') 20 | self.max_p_len = max_p_len 21 | self.max_q_len = max_q_len 22 | 23 | self.train_set, self.dev_set, self.test_set = [], [], [] 24 | if train_files: 25 | for train_file in train_files: 26 | self.train_set += self._load_dataset(train_file) 27 | self.logger.info('train {} ques.'.format(len(self.train_set))) 28 | 29 | if dev_files: 30 | for dev_file in dev_files: 31 | self.dev_set += self._load_dataset(dev_file) 32 | self.logger.info('dev {} ques'.format(len(self.dev_set))) 33 | 34 | if test_files: 35 | for test_file in test_files: 36 | self.test_set += self._load_dataset(test_file,test=True) 37 | self.logger.info('test {} ques'.format(len(self.test_set))) 38 | 39 | def _load_dataset(self, data_path, sampling=False,test=False): 40 | ''' 41 | 加载数据集 42 | ''' 43 | with open(data_path, encoding='utf-8') as fin: 44 | data_set = [] 45 | filter_long_para, filter_long_query, filter_zero_query = 0, 0, 0 46 | for lidx, line in enumerate(fin): 47 | if sampling: 48 | if random.randint(1, 10) > 1: 49 | continue 50 | sample = json.loads(line.strip()) 51 | if test != True: 52 | if len(sample['segmented_passage']) > self.max_p_len: 53 | filter_long_para += 1 54 | continue 55 | if len(sample['segmented_query']) > self.max_q_len: 56 | filter_long_query += 1 57 | continue 58 | if len(sample['segmented_query']) == 0: 59 | filter_zero_query += 1 60 | continue 61 | else: 62 | if len(sample['segmented_passage']) > self.max_p_len: 63 | filter_long_para += 1 64 | sample['segmented_passage'] = sample['segmented_passage'][:self.max_p_len] 65 | if len(sample['segmented_query']) > self.max_q_len: 66 | filter_long_query += 1 67 | sample['segmented_query'] = sample['segmented_query'][:self.max_q_len] 68 | if len(sample['segmented_query']) == 0: 69 | filter_zero_query += 1 70 | sample['segmented_query'].append(['Empty']) 71 | if len(sample['segmented_passage']) > self.max_p_len: 72 | filter_long_para += 1 73 | continue 74 | if len(sample['segmented_query']) > self.max_q_len: 75 | filter_long_query += 1 76 | continue 77 | if len(sample['segmented_query']) == 0: 78 | filter_zero_query += 1 79 | continue 80 | scores = [] 81 | if 'answer' in sample: 82 | fake_label = 0 83 | # fake_label = sample['label_answer'] 84 | alternatives = sample['alternatives'].split('|') 85 | for alternative in alternatives: 86 | score = 0 87 | if '无法确定' in alternative or '无法确认' in alternative or '无法确的' in alternative: 88 | score += 3 89 | elif '不' in alternative or '没' in alternative: 90 | score += 2 91 | scores.append(score) 92 | if sum(scores) < 5: 93 | sample['choose_type'] = 1.0 94 | else: 95 | sample['choose_type'] = 0.0 96 | f_index = scores.index(min(scores)) 97 | scores[f_index] = 10 98 | s_index = scores.index(min(scores)) 99 | scores[s_index] = 10 100 | t_index = scores.index(min(scores)) 101 | segmented_alternatives = [sample['segmented_alternatives'][f_index], 102 | sample['segmented_alternatives'][s_index], 103 | sample['segmented_alternatives'][t_index]] 104 | 105 | sample['segmented_alternatives'] = segmented_alternatives 106 | pos_alternatives = [sample['pos_alternatives'][f_index], sample['pos_alternatives'][s_index], 107 | sample['pos_alternatives'][t_index]] 108 | sample['pos_alternatives'] = pos_alternatives 109 | if 
f_index == fake_label: 110 | sample['label_answer'] = 0 111 | elif s_index == fake_label: 112 | sample['label_answer'] = 1 113 | else: 114 | sample['label_answer'] = 2 115 | else: 116 | alternatives = sample['alternatives'].split('|') 117 | for alternative in alternatives: 118 | score = 0 119 | if '无法确定' in alternative or '无法确认' in alternative or '无法确的' in alternative: 120 | score += 3 121 | elif '不' in alternative or '没' in alternative: 122 | score += 2 123 | scores.append(score) 124 | if sum(scores) < 5: 125 | sample['choose_type'] = 1.0 126 | else: 127 | sample['choose_type'] = 0.0 128 | data_set.append(sample) 129 | 130 | print('passage too long :', filter_long_para, 'query filte:', filter_long_query + filter_zero_query) 131 | return data_set 132 | 133 | def _one_mini_batch(self, data, indices, pad_id): 134 | ''' 135 | 一个最小的批次 136 | ''' 137 | batch_data = {'raw_data': [data[i] for i in indices], 138 | 'query_token_ids': [], 139 | 'query_length': [], 140 | 'passage_token_ids': [], 141 | 'passage_length': [], 142 | 'alternative_f_token_ids': [], 143 | 'alternatives_f_length': [], 144 | 'alternative_s_token_ids': [], 145 | 'alternatives_s_length': [], 146 | 'alternative_t_token_ids': [], 147 | 'alternatives_t_length': [], 148 | 'alternative': [], 149 | 'label_answer': [], 150 | 'choose_type': []} 151 | # 将每个样本的信息都记录到batch里 152 | for sidx, sample in enumerate(batch_data['raw_data']): 153 | batch_data['query_token_ids'].append(sample['query_token_ids']) 154 | batch_data['query_length'].append(len(sample['query_token_ids'])) 155 | batch_data['passage_token_ids'].append(sample['passage_token_ids']) 156 | batch_data['passage_length'].append(len(sample['passage_token_ids'])) 157 | 158 | if len(sample['alternatives_token_ids']) < 3: 159 | for i in range(0,3-len(sample['alternatives_token_ids'])): 160 | sample['alternatives_token_ids'].append([0]) 161 | 162 | batch_data['alternative_f_token_ids'].append(sample['alternatives_token_ids'][0]) 163 | batch_data['alternatives_f_length'].append(len(sample['alternatives_token_ids'][0])) 164 | 165 | 166 | batch_data['alternative_s_token_ids'].append(sample['alternatives_token_ids'][1]) 167 | batch_data['alternatives_s_length'].append(len(sample['alternatives_token_ids'][1])) 168 | 169 | batch_data['alternative_t_token_ids'].append(sample['alternatives_token_ids'][2]) 170 | batch_data['alternatives_t_length'].append(len(sample['alternatives_token_ids'][2])) 171 | batch_data['choose_type'].append(sample['choose_type']) 172 | try: 173 | batch_data['label_answer'].append(sample['label_answer']) 174 | except KeyError: 175 | batch_data['label_answer'].append(0) 176 | batch_data, padded_p_len, padded_q_len, padded_a_f_len, padded_a_s_len, padded_a_t_len = self._dynamic_padding( 177 | batch_data, pad_id) 178 | return batch_data 179 | 180 | def _dynamic_padding(self, batch_data, pad_id): 181 | ''' 182 | 根据pad_id动态填充batch_data 183 | ''' 184 | pad_p_len = min(self.max_p_len, max(batch_data['passage_length'])) 185 | pad_q_len = min(self.max_q_len, max(batch_data['query_length'])) 186 | pad_a_f_len = max(batch_data['alternatives_f_length']) 187 | pad_a_s_len = max(batch_data['alternatives_s_length']) 188 | pad_a_t_len = max(batch_data['alternatives_t_length']) 189 | batch_data['passage_token_ids'] = [(ids + [pad_id] * (pad_p_len - len(ids)))[: pad_p_len] 190 | for ids in batch_data['passage_token_ids']] 191 | batch_data['query_token_ids'] = [(ids + [pad_id] * (pad_q_len - len(ids)))[: pad_q_len] 192 | for ids in batch_data['query_token_ids']] 193 | 
batch_data['alternative_f_token_ids'] = [(ids + [pad_id] * (pad_a_f_len - len(ids)))[: pad_a_f_len] 194 | for ids in batch_data['alternative_f_token_ids']] 195 | batch_data['alternative_s_token_ids'] = [(ids + [pad_id] * (pad_a_s_len - len(ids)))[: pad_a_s_len] 196 | for ids in batch_data['alternative_s_token_ids']] 197 | batch_data['alternative_t_token_ids'] = [(ids + [pad_id] * (pad_a_t_len - len(ids)))[: pad_a_t_len] 198 | for ids in batch_data['alternative_t_token_ids']] 199 | return batch_data, pad_p_len, pad_q_len, pad_a_s_len, pad_a_s_len, pad_a_t_len 200 | 201 | def word_iter(self, set_name=None): 202 | ''' 203 | 遍历数据集里的所有词语 204 | Args: 205 | set_name: if it is set, then the specific set will be used 206 | Returns: 207 | a generator 208 | ''' 209 | if set_name is None: 210 | data_set = self.train_set + self.dev_set + self.test_set 211 | elif set_name == 'train': 212 | data_set = self.train_set 213 | elif set_name == 'dev': 214 | data_set = self.dev_set 215 | elif set_name == 'test': 216 | data_set = self.test_set 217 | else: 218 | raise NotImplementedError('No data set named as {}'.format(set_name)) 219 | if data_set is not None: 220 | for sample in data_set: 221 | for token in sample['segmented_passage']: 222 | yield token 223 | for token in sample['segmented_query']: 224 | yield token 225 | for tokens in sample['segmented_alternatives']: 226 | for token in tokens: 227 | yield token 228 | 229 | def convert_to_ids(self, vocab): 230 | ''' 231 | 把原始数据集里的问题和文章转化为id序列 232 | ''' 233 | for data_set in [self.train_set, self.dev_set, self.test_set]: 234 | if data_set is None: 235 | continue 236 | for sample in data_set: 237 | sample['query_token_ids'] = vocab.convert_to_ids(sample['segmented_query']) 238 | sample['passage_token_ids'] = vocab.convert_to_ids(sample['segmented_passage']) 239 | sample['alternatives_token_ids'] = [] 240 | for ans in sample['segmented_alternatives']: 241 | sample['alternatives_token_ids'].append(vocab.convert_to_ids(ans)) 242 | 243 | def gen_mini_batches(self, set_name, batch_size, pad_id, shuffle=True): 244 | ''' 245 | 对于任一个指定的数据集(train/dev/test)都通用的batch 246 | ''' 247 | if set_name == 'train': 248 | data = self.train_set 249 | elif set_name == 'dev': 250 | data = self.dev_set 251 | elif set_name == 'test': 252 | data = self.test_set 253 | else: 254 | raise NotImplementedError('no such {} set'.format(set_name)) 255 | data_size = len(data) 256 | indices = np.arange(data_size) 257 | if shuffle: 258 | np.random.shuffle(indices) 259 | for batch_start in np.arange(0, data_size, batch_size): 260 | batch_indices = indices[batch_start: batch_start + batch_size] 261 | yield self._one_mini_batch(data, batch_indices, pad_id) 262 | -------------------------------------------------------------------------------- /capsuleNet/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | -------------------------------------------------------------------------------- /capsuleNet/layers/basic_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | import tensorflow.contrib as tc 5 | 6 | 7 | def rnn(rnn_type, inputs, length, hidden_size, layer_num=1, dropout_keep_prob=None, concat=True, state=None, 8 | history=False): 9 | """ 10 | 实现 (Bi-)LSTM, (Bi-)GRU 和 (Bi-)RNN 11 | """ 12 | if history: 13 | if not rnn_type.startswith('bi'): 14 | cell = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 15 | outputs, state = tf.nn.dynamic_rnn(cell, inputs, sequence_length=length, dtype=tf.float32) 16 | if rnn_type.endswith('lstm'): 17 | c, h = state 18 | state = h 19 | else: 20 | # 双向lstm,前向细胞、反向细胞 21 | cell_fw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 22 | cell_bw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 23 | outputs, state = tf.nn.bidirectional_dynamic_rnn( 24 | cell_bw, cell_fw, inputs, sequence_length=length, dtype=tf.float32, initial_state_fw=state, 25 | initial_state_bw=state 26 | ) 27 | # 获取双向状态 28 | state_fw, state_bw = state 29 | if rnn_type.endswith('lstm'): 30 | c_fw, h_fw = state_fw 31 | c_bw, h_bw = state_bw 32 | # 双向历史信息 33 | state_fw, state_bw = h_fw, h_bw 34 | if concat: 35 | outputs = tf.concat(outputs, 2) 36 | state = tf.concat([state_fw, state_bw], 1) 37 | else: 38 | outputs = outputs[0] + outputs[1] 39 | state = state_fw + state_bw 40 | return outputs, state 41 | else: 42 | if not rnn_type.startswith('bi'): 43 | cell = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 44 | outputs, state = tf.nn.dynamic_rnn(cell, inputs, sequence_length=length, dtype=tf.float32) 45 | if rnn_type.endswith('lstm'): 46 | c, h = state 47 | state = h 48 | else: 49 | # 双向lstm,前向细胞、反向细胞 50 | cell_fw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 51 | cell_bw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 52 | outputs, state = tf.nn.bidirectional_dynamic_rnn( 53 | cell_bw, cell_fw, inputs, sequence_length=length, dtype=tf.float32 54 | ) 55 | # 获取双向状态 56 | state_fw, state_bw = state 57 | state = state_fw 58 | if concat: 59 | outputs = tf.concat(outputs, 2) 60 | else: 61 | outputs = outputs[0] + outputs[1] 62 | return outputs, state 63 | 64 | 65 | def get_cell(rnn_type, hidden_size, layer_num=1, dropout_keep_prob=None): 66 | """ 67 | 获取循环神经网络的细胞 68 | """ 69 | if rnn_type.endswith('lstm'): 70 | cell = tc.rnn.LSTMCell(num_units=hidden_size, state_is_tuple=True) 71 | elif rnn_type.endswith('gru'): 72 | cell = tc.rnn.GRUCell(num_units=hidden_size) 73 | elif rnn_type.endswith('rnn'): 74 | cell = tc.rnn.BasicRNNCell(num_units=hidden_size) 75 | else: 76 | raise NotImplementedError('Unsuported rnn type: {}'.format(rnn_type)) 77 | if dropout_keep_prob is not None: 78 | cell = tc.rnn.DropoutWrapper(cell, 79 | input_keep_prob=dropout_keep_prob, 80 | output_keep_prob=dropout_keep_prob) 81 | if layer_num > 1: 82 | cell = tc.rnn.MultiRNNCell([cell]*layer_num, state_is_tuple=True) 83 | return cell 84 | 85 | 86 | -------------------------------------------------------------------------------- /capsuleNet/layers/caps_layer.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | import tensorflow as tf 3 | 4 | epsilon = 1e-9 5 | 6 | 7 | def reduce_sum(input_tensor, axis=None, keepdims=False): 8 | try: 9 | return tf.reduce_sum(input_tensor, axis=axis, keepdims=keepdims) 10 | except: 11 | return tf.reduce_sum(input_tensor, axis=axis, keep_dims=keepdims) 12 | 13 | 14 | def softmax(logits, axis=None): 15 | try: 16 | return tf.nn.softmax(logits, axis=axis) 17 | except: 18 | return tf.nn.softmax(logits, dim=axis) 19 | 20 | 21 | def fc_capsule(sep_encodes, mask): 22 | attn = softmax(tf.layers.dense(sep_encodes, 1, activation=tf.nn.tanh) + mask, 1) 23 | return tf.reduce_sum(attn * sep_encodes, 1) 24 | 25 | 26 | def classify_capsule(fuse_encodes): 27 | attn = tf.expand_dims(softmax(tf.layers.dense(fuse_encodes, 3, activation=tf.nn.tanh), 1), 2) 28 | return tf.reduce_sum(attn * tf.expand_dims(fuse_encodes, 3), 1) 29 | 30 | 31 | def concat_capsules(capsule_af, capsule_as, capsule_at, mask_f, mask_s, mask_t): 32 | # batch , dim ,3 33 | concat_capsule = tf.concat( 34 | [tf.expand_dims(t, 2) for t in 35 | [fc_capsule(capsule_af, mask_f), fc_capsule(capsule_as, mask_s), fc_capsule(capsule_at, mask_t)]], 2) 36 | # batch ,1 , dim , 1 ,3 37 | expand_capsules = tf.expand_dims(concat_capsule, 1) 38 | alter_capsules = tf.expand_dims(expand_capsules, 3) 39 | return alter_capsules 40 | 41 | 42 | def routing(conv_capsules, alter_capsule, dim_size, conv_nums): 43 | inputs = tf.expand_dims(conv_capsules, 4) 44 | # w [batch size, time, dim ,channels, out capsules] 45 | caps_w = tf.get_variable('caps_weight', shape=(1, 1, dim_size, conv_nums, 3), dtype=tf.float32, 46 | initializer=tf.random_normal_initializer(stddev=0.1)) 47 | caps_b = tf.get_variable('caps_bias', shape=(1, 1, dim_size, conv_nums, 3)) 48 | # u [batch size, time, dim ,channels, out capsules] 49 | u_hat = caps_w * inputs + caps_b 50 | u_hat = u_hat * alter_capsule 51 | # batch , time , 1 , channels , out capsules 52 | b_ij = reduce_sum(0 * u_hat, axis=2, keepdims=True) 53 | iter_time = 3 54 | for r_iter in range(iter_time): 55 | with tf.variable_scope('iter_' + str(r_iter)): 56 | # softmax 57 | c_ij = softmax(b_ij, axis=4) 58 | # At last iteration, use `u_hat` in order to receive gradients from the following graph 59 | if r_iter == iter_time - 1: 60 | s_j = tf.multiply(c_ij, u_hat) 61 | s_j = reduce_sum(s_j, axis=1, keepdims=True) 62 | s_j = reduce_sum(s_j, axis=3, keepdims=True) 63 | v_j = squash(s_j) 64 | v_j = tf.reshape(v_j, [-1,dim_size,3]) 65 | else: # Inner iterations, do not apply backpropagation 66 | s_j = tf.multiply(c_ij, u_hat) 67 | s_j = reduce_sum(s_j, axis=1, keepdims=True) 68 | s_j = reduce_sum(s_j, axis=3, keepdims=True) 69 | v_j = squash(s_j) 70 | 71 | u_produce_v = reduce_sum(v_j * u_hat, axis=2, keepdims=True) 72 | 73 | # b_ij += tf.reduce_sum(u_produce_v, axis=0, keep_dims=True) 74 | b_ij += u_produce_v 75 | 76 | return v_j 77 | 78 | 79 | def squash(vector): 80 | vec_squared_norm = reduce_sum(tf.square(vector), -3, keepdims=True) 81 | scalar_factor = vec_squared_norm / (1 + vec_squared_norm) / tf.sqrt(vec_squared_norm + epsilon) 82 | vec_squashed = scalar_factor * vector # element-wise 83 | return vec_squashed 84 | -------------------------------------------------------------------------------- /capsuleNet/layers/match_layer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 该模块实现Match-LSTM和BiDAF算法 4 | """ 5 | 6 | import tensorflow as tf 7 | import tensorflow.contrib as tc 8 | from .basic_rnn import rnn 9 | 10 | 
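# AttentionFlowMatchLayer follows the BiDAF matching scheme: a passage-question
# similarity matrix gives context-to-question and question-to-context attention,
# and four auxiliary attentions (concat, bi-linear, dot, minus) are computed from
# the same encodings. match() concatenates the passage encodings with these
# attended representations, so passage_outputs has eight components (8x the
# encoder feature size); question_encodes is returned unchanged.
# Illustrative shapes: passage_encodes [batch, p_len, d], question_encodes
# [batch, q_len, d]  ->  passage_outputs [batch, p_len, 8 * d].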
11 | class AttentionFlowMatchLayer(object): 12 | """ 13 | 实现注意力流层来计算文本-问题、问题-文本的注意力 14 | """ 15 | 16 | def __init__(self, hidden_size): 17 | self.hidden_size = hidden_size 18 | self.dim_size = hidden_size * 2 19 | 20 | """ 21 | 根据问题向量来匹配文章向量 22 | """ 23 | 24 | def match(self, passage_encodes, question_encodes): 25 | with tf.variable_scope('attn-match'): 26 | # bidaf 27 | sim_matrix = tf.matmul(passage_encodes, question_encodes, transpose_b=True) 28 | context2question_attn = tf.matmul(tf.nn.softmax(sim_matrix, -1), question_encodes) 29 | b = tf.nn.softmax(tf.expand_dims(tf.reduce_max(sim_matrix, 2), 1), -1) 30 | question2context_attn = tf.tile(tf.matmul(b, passage_encodes), 31 | [1, tf.shape(passage_encodes)[1], 1]) 32 | 33 | dnm_s1 = tf.expand_dims(passage_encodes, 2) 34 | dnm_s2 = tf.expand_dims(question_encodes, 1) 35 | 36 | # concat Attn 37 | sjt = tf.reduce_sum(dnm_s1 + dnm_s2, 3) 38 | ait = tf.nn.softmax(sjt, 2) 39 | qtc = tf.matmul(ait, question_encodes) 40 | 41 | # bi-linear Attn 42 | sjt = tf.matmul(passage_encodes, tf.transpose(question_encodes, perm=[0, 2, 1])) 43 | ait = tf.nn.softmax(sjt, 2) 44 | qtb = tf.matmul(ait, question_encodes) 45 | 46 | # dot Attn 47 | sjt = tf.reduce_sum(dnm_s1 * dnm_s2, 3) 48 | ait = tf.nn.softmax(sjt, 2) 49 | qtd = tf.matmul(ait, question_encodes) 50 | 51 | # minus Attn 52 | sjt = tf.reduce_sum(dnm_s1 - dnm_s2, 3) 53 | ait = tf.nn.softmax(sjt, 2) 54 | qtm = tf.matmul(ait, question_encodes) 55 | 56 | passage_outputs = tf.concat([passage_encodes, context2question_attn, 57 | passage_encodes * context2question_attn, 58 | passage_encodes * question2context_attn, qtc, qtb, qtd, qtm], -1) 59 | 60 | return passage_outputs, question_encodes 61 | 62 | 63 | class SelfMatchingLayer(object): 64 | """ 65 | Implements the self-matching layer. 
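    Passage encodings attend over the whole-passage encodings with scaled
    dot-product attention, the concatenated result is passed through a sigmoid
    gate, and a Bi-LSTM (basic_rnn.rnn) produces match_outputs and match_state.

    Illustrative usage (shapes only; passage_encodes: [batch, p_len, d]):
        layer = SelfMatchingLayer(hidden_size=150)
        outputs, state = layer.match(passage_encodes, passage_encodes, p_length, p_mask)
        # outputs: [batch, p_len, 2 * 150]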
66 | """ 67 | 68 | def __init__(self, hidden_size): 69 | self.hidden_size = hidden_size 70 | 71 | def match(self, passage_encodes, whole_passage_encodes, p_length, p_mask): 72 | def dot_attention(inputs, memory, mask, hidden, scope="dot_attention"): 73 | with tf.variable_scope(scope): 74 | JX = tf.shape(inputs)[1] 75 | 76 | with tf.variable_scope("attention"): 77 | inputs_ = tf.nn.relu( 78 | tf.layers.dense(inputs, hidden, use_bias=False)) 79 | memory_ = tf.nn.relu( 80 | tf.layers.dense(memory, hidden, use_bias=False)) 81 | outputs = tf.matmul(inputs_, tf.transpose( 82 | memory_, [0, 2, 1])) / (hidden ** 0.5) 83 | mask = tf.tile(tf.expand_dims(mask, axis=1), [1, JX, 1]) 84 | logits = tf.nn.softmax(outputs + mask) 85 | outputs = tf.matmul(logits, memory) 86 | res = tf.concat([inputs, outputs], axis=2) 87 | 88 | with tf.variable_scope("gate"): 89 | dim = res.get_shape().as_list()[-1] 90 | gate = tf.nn.sigmoid(tf.layers.dense(res, dim, use_bias=False)) 91 | return res * gate 92 | 93 | self_att = dot_attention(passage_encodes, whole_passage_encodes, p_mask, self.hidden_size) 94 | 95 | match_outputs, match_state = rnn('bi-lstm', self_att, p_length, self.hidden_size) 96 | return match_outputs, match_state 97 | -------------------------------------------------------------------------------- /capsuleNet/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/antdlx/aic18_rc/1e243f0b385e4d7645658020ecb202866b9829aa/capsuleNet/model.png -------------------------------------------------------------------------------- /capsuleNet/post_process.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import jieba 4 | from tqdm import tqdm 5 | 6 | 7 | def word_tokenize(sent): 8 | return list(jieba.cut(sent)) 9 | 10 | 11 | def process_file(filename, data_type, save_filename): 12 | print("Generating {} examples...".format(data_type)) 13 | save_ = open(save_filename, 'w', encoding='utf-8') 14 | datas = [] 15 | with open(filename, "r",encoding='utf8',errors='ignore') as fh: 16 | for line in tqdm(fh): 17 | try: 18 | dic = json.loads(line) 19 | except: 20 | continue 21 | 22 | question = dic['query'] 23 | # passage = re.sub(pattern, '', dic['passage']) 24 | passage = dic['passage'] 25 | alternatives = dic['alternatives'] 26 | pos_alternatives = alternatives.split('|') 27 | segmented_alternatives = [] 28 | for alt in pos_alternatives: 29 | segmented_alternatives.append(word_tokenize(alt)) 30 | ques_word = word_tokenize(question) 31 | passage_word = word_tokenize(passage) 32 | if data_type == 'test': 33 | data = {"segmented_passage": passage_word, "segmented_query": ques_word, 34 | "alternatives": alternatives, "pos_alternatives": pos_alternatives, 35 | "segmented_alternatives": segmented_alternatives,"query_id":dic['query_id']} 36 | else: 37 | data = {"segmented_passage": passage_word, "segmented_query": ques_word, 38 | "alternatives": alternatives, "pos_alternatives": pos_alternatives, 39 | "segmented_alternatives": segmented_alternatives, "answer": dic['answer'],"query_id":dic['query_id']} 40 | save_.write(json.dumps(data,ensure_ascii=False)+"\n") 41 | datas.append(data) 42 | # random.shuffle(examples) 43 | print("{} data in total".format(len(datas))) 44 | save_.close() 45 | return datas 46 | 47 | 48 | 49 | def prepro(ipath,mode,opath): 50 | # 1-valid 2-test 3-prepro 51 | if mode == 1: 52 | process_file(ipath, 'dev', opath) 53 | elif mode == 2: 54 | process_file(ipath, 'test', opath) 55 | 
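    # mode 3 regenerates the preprocessed dev and train files in addition to the test file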
elif mode == 3: 56 | process_file("../data/devset/dev_mini.json", 'dev', "../data/devset/dev_pre.json") 57 | process_file("../data/trainset/train_mini.json", 'train', "../data/trainset/train_pre.json") 58 | process_file(ipath, 'test', opath) 59 | -------------------------------------------------------------------------------- /capsuleNet/run81.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | 3 | 4 | # import sys 5 | # sys.path.append('..') 6 | 7 | import os 8 | 9 | from capsuleNet.post_process import prepro 10 | from capsuleNet.vocab import Vocab 11 | 12 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 13 | import pickle 14 | import argparse 15 | import logging 16 | from capsuleNet.dataset import BRCDataset 17 | # from vocab import Vocab 18 | from capsuleNet.rc_model81 import RCModel 19 | 20 | 21 | def parse_args(): 22 | """ 23 | 解析命令行变量 24 | 25 | """ 26 | parser = argparse.ArgumentParser('Reading Comprehension on aic dataset') 27 | parser.add_argument('--mode', default="test", 28 | help="Running mode test/dev/train/prepro") 29 | parser.add_argument('--input', default="../data/devset/dev_mini.json", 30 | help='input path') 31 | parser.add_argument('--prepare', action='store_true', default=False, 32 | help='create the directories, prepare the vocabulary and embeddings') 33 | parser.add_argument('--train', action='store_true', default=False, 34 | help='train the model') 35 | parser.add_argument('--evaluate', action='store_true', default=False, 36 | help='evaluate the model on dev set') 37 | parser.add_argument('--predict', action='store_true', default=False, 38 | help='predict the answers for test set with trained model') 39 | parser.add_argument('--gpu', type=str, default='3', 40 | help='specify gpu device') 41 | 42 | train_settings = parser.add_argument_group('train settings') 43 | train_settings.add_argument('--optim', default='adam', 44 | help='optimizer type') 45 | train_settings.add_argument('--learning_rate', type=float, default=0.0005, 46 | help='learning rate') 47 | train_settings.add_argument('--weight_decay', type=float, default=0, 48 | help='weight decay') 49 | train_settings.add_argument('--dropout_keep_prob', type=float, default=1, 50 | help='dropout keep rate') 51 | train_settings.add_argument('--batch_size', type=int, default=10, 52 | help='train batch size') 53 | train_settings.add_argument('--epochs', type=int, default=4, 54 | help='train epochs') 55 | 56 | model_settings = parser.add_argument_group('model settings') 57 | model_settings.add_argument('--algo', choices=['BIDAF', 'MLSTM'], default='BIDAF', 58 | help='choose the algorithm to use') 59 | model_settings.add_argument('--load_epoch', default=1) 60 | model_settings.add_argument('--embed_size', type=int, default=300, 61 | help='size of the embeddings') 62 | model_settings.add_argument('--hidden_size', type=int, default=150, 63 | help='size of LSTM hidden units') 64 | model_settings.add_argument('--max_p_len', type=int, default=500, 65 | help='max length of passage') 66 | model_settings.add_argument('--max_q_len', type=int, default=30, 67 | help='max length of question') 68 | model_settings.add_argument('--max_a_len', type=int, default=10, 69 | help='max length of answer') 70 | 71 | path_settings = parser.add_argument_group('path settings') 72 | path_settings.add_argument('--train_files', nargs='+', 73 | default=['../data/trainset/train_pre.json'], 74 | help='list of files that contain the preprocessed train data') 75 | 
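    # the *_pre.json paths below are produced by post_process.prepro() before training or evaluation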
path_settings.add_argument('--dev_files', nargs='+', 76 | default=['../data/devset/dev_pre.json'], 77 | help='list of files that contain the preprocessed dev data') 78 | path_settings.add_argument('--test_files', nargs='+', 79 | default=['../data/v81/testset/test_pre.json'], 80 | help='list of files that contain the preprocessed test data') 81 | path_settings.add_argument('--vocab_dir', default='../data/v81/vocab/', 82 | help='the dir to save vocabulary') 83 | path_settings.add_argument('--model_dir', default='../data/v81/models/', 84 | help='the dir to store models') 85 | path_settings.add_argument('--result_dir', default='../data/v81/results/', 86 | help='the dir to output the results') 87 | path_settings.add_argument('--summary_dir', default='../data/v81/summary/', 88 | help='the dir to write tensorboard summary') 89 | path_settings.add_argument('--log_path', default='../data/v81/logging2', 90 | help='path of the log file. If not set, logs are printed to console') 91 | 92 | return parser.parse_args() 93 | 94 | 95 | def prepare(args): 96 | """ 97 | 检查数据,创建目录,准备词汇表和词嵌入 98 | checks data, creates the directories, prepare the vocabulary and embeddings 99 | """ 100 | logger = logging.getLogger("brc") 101 | logger.info('checking data file...') 102 | for data_path in args.train_files + args.dev_files + args.test_files: 103 | assert os.path.exists(data_path), '{} is not exits.'.format(data_path) 104 | logger.info('establish folder...') 105 | for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]: 106 | if not os.path.exists(dir_path): 107 | os.makedirs(dir_path) 108 | 109 | logger.info('establish vocab...') 110 | brc_data = BRCDataset(args.max_p_len, args.max_q_len, 111 | args.train_files, args.dev_files, args.test_files) 112 | vocab = Vocab(lower=True) 113 | print(vocab.size()) 114 | for word in brc_data.word_iter('train'): 115 | vocab.add(word) 116 | 117 | unfiltered_vocab_size = vocab.size() 118 | vocab.filter_tokens_by_cnt(min_cnt=2) 119 | 120 | print(vocab.size()) 121 | filtered_num = unfiltered_vocab_size - vocab.size() 122 | logger.info('filte {} words, final num is {}'.format(filtered_num, 123 | vocab.size())) 124 | logger.info('use w2v...') 125 | # vocab.randomly_init_embeddings(args.embed_size) 126 | vocab.load_pretrained_embeddings('../data/w2v/word2vec.model') 127 | 128 | print(vocab.size()) 129 | logger.info('save word table...') 130 | with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout: 131 | pickle.dump(vocab, fout) 132 | 133 | logger.info('finish prepro!') 134 | 135 | 136 | def train(args): 137 | """ 138 | 训练阅读理解模型 139 | """ 140 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 141 | logger = logging.getLogger("brc") 142 | 143 | file_handler = logging.FileHandler(args.log_path) 144 | file_handler.setLevel(logging.INFO) 145 | file_handler.setFormatter(formatter) 146 | logger.addHandler(file_handler) 147 | 148 | console_handler = logging.StreamHandler() 149 | console_handler.setLevel(logging.INFO) 150 | console_handler.setFormatter(formatter) 151 | logger.addHandler(console_handler) 152 | 153 | logger.info(args) 154 | 155 | logger.info('loading datasets and vocab.data...') 156 | with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: 157 | vocab = pickle.load(fin) 158 | brc_data = BRCDataset(args.max_p_len, args.max_q_len, 159 | args.train_files, args.dev_files) 160 | logger.info('changing to id...') 161 | brc_data.convert_to_ids(vocab) 162 | logger.info('init model...') 163 | rc_model = 
RCModel(vocab, args) 164 | logger.info('training model...') 165 | rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, 166 | save_prefix=args.algo, 167 | dropout_keep_prob=args.dropout_keep_prob) 168 | logger.info('finish training!') 169 | 170 | 171 | def evaluate(args): 172 | """ 173 | 对训练好的模型进行验证 174 | """ 175 | logger = logging.getLogger("brc") 176 | logger.info('loading datasets and vocab.data...') 177 | with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: 178 | vocab = pickle.load(fin) 179 | assert len(args.dev_files) > 0, 'can not find valid file.' 180 | brc_data = BRCDataset(args.max_p_len, args.max_q_len, dev_files=args.dev_files) 181 | logger.info('change txt to id list') 182 | brc_data.convert_to_ids(vocab) 183 | logger.info('reloading model...') 184 | rc_model = RCModel(vocab, args) 185 | rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo + '_{}'.format(args.epochs)) 186 | logger.info('valid model...') 187 | dev_batches = brc_data.gen_mini_batches('dev', args.batch_size, 188 | pad_id=vocab.get_id(vocab.pad_token), shuffle=False) 189 | dev_loss = rc_model.evaluate(dev_batches, result_dir=args.result_dir, result_prefix='valid_v81') 190 | logger.info('dev loss is: {}'.format(dev_loss)) 191 | logger.info('save predict ans to {}'.format(os.path.join(args.result_dir))) 192 | 193 | 194 | def predict(args): 195 | """ 196 | 预测测试文件的答案 197 | """ 198 | logger = logging.getLogger("brc") 199 | logger.info('loading datasets and vocab.data...') 200 | with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: 201 | vocab = pickle.load(fin) 202 | assert len(args.test_files) > 0, 'can not find test file.' 203 | brc_data = BRCDataset(args.max_p_len, args.max_q_len, 204 | test_files=args.test_files) 205 | logger.info('change txt to id...') 206 | brc_data.convert_to_ids(vocab) 207 | logger.info('reloading model...') 208 | rc_model = RCModel(vocab, args) 209 | rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo + '_{}'.format(args.epochs)) 210 | logger.info('predict ans...') 211 | test_batches = brc_data.gen_mini_batches('test', args.batch_size, 212 | pad_id=vocab.get_id(vocab.pad_token), shuffle=False) 213 | rc_model.evaluate(test_batches, 214 | result_dir=args.result_dir, result_prefix='test_v81') 215 | 216 | 217 | def run(): 218 | """ 219 | 预训练并运行整个系统. 
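    --mode selects the pipeline stage: dev -> evaluate, test -> predict,
    prepro -> prepare (and preprocess all splits), train -> train.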
220 | """ 221 | args = parse_args() 222 | 223 | if args.mode == "dev": 224 | prepro(args.input,1,'../data/devset/dev_pre.json') 225 | args.evaluate = True 226 | elif args.mode == "test": 227 | prepro(args.input, 2, '../data/v81/testset/test_pre.json') 228 | args.predict = True 229 | elif args.mode == "prepro": 230 | args.prepare = True 231 | prepro(args.input, 3, '../data/v81/testset/test_pre.json') 232 | elif args.mode == "train": 233 | prepro(args.input, 1, '../data/trainset/train_pre.json') 234 | args.train = True 235 | 236 | logger = logging.getLogger("brc") 237 | logger.setLevel(logging.INFO) 238 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 239 | if args.log_path: 240 | file_handler = logging.FileHandler(args.log_path) 241 | file_handler.setLevel(logging.INFO) 242 | file_handler.setFormatter(formatter) 243 | logger.addHandler(file_handler) 244 | else: 245 | console_handler = logging.StreamHandler() 246 | console_handler.setLevel(logging.INFO) 247 | console_handler.setFormatter(formatter) 248 | logger.addHandler(console_handler) 249 | 250 | logger.info('Running with args : {}'.format(args)) 251 | 252 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 253 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 254 | 255 | if args.prepare: 256 | prepare(args) 257 | if args.train: 258 | train(args) 259 | if args.evaluate: 260 | evaluate(args) 261 | if args.predict: 262 | predict(args) 263 | 264 | 265 | if __name__ == '__main__': 266 | run() 267 | -------------------------------------------------------------------------------- /capsuleNet/run84.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | 3 | 4 | # import sys 5 | # sys.path.append('..') 6 | 7 | import os 8 | 9 | from capsuleNet.post_process import prepro 10 | 11 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 12 | import pickle 13 | import argparse 14 | import logging 15 | from capsuleNet.dataset import BRCDataset 16 | from capsuleNet.vocab import Vocab 17 | from capsuleNet.rc_model84 import RCModel 18 | 19 | 20 | def parse_args(): 21 | """ 22 | 解析命令行变量 23 | 24 | """ 25 | parser = argparse.ArgumentParser('Reading Comprehension on aic dataset') 26 | parser.add_argument('--mode', default="test", 27 | help="Running mode test/dev/train/prepro") 28 | parser.add_argument('--input', default="../data/v84/testset/test_mini.json", 29 | help='input path') 30 | parser.add_argument('--prepare', action='store_true', default=False, 31 | help='create the directories, prepare the vocabulary and embeddings') 32 | parser.add_argument('--train', action='store_true', default=False, 33 | help='train the model') 34 | parser.add_argument('--evaluate', action='store_true', default=False, 35 | help='evaluate the model on dev set') 36 | parser.add_argument('--predict', action='store_true', default=False, 37 | help='predict the answers for test set with trained model') 38 | parser.add_argument('--gpu', type=str, default='3', 39 | help='specify gpu device') 40 | 41 | train_settings = parser.add_argument_group('train settings') 42 | train_settings.add_argument('--optim', default='adam', 43 | help='optimizer type') 44 | train_settings.add_argument('--learning_rate', type=float, default=0.001, 45 | help='learning rate') 46 | train_settings.add_argument('--weight_decay', type=float, default=0, 47 | help='weight decay') 48 | train_settings.add_argument('--dropout_keep_prob', type=float, default=1, 49 | help='dropout keep rate') 50 | train_settings.add_argument('--batch_size', 
type=int, default=10, 51 | help='train batch size') 52 | train_settings.add_argument('--epochs', type=int, default=4, 53 | help='train epochs') 54 | 55 | model_settings = parser.add_argument_group('model settings') 56 | model_settings.add_argument('--algo', choices=['BIDAF', 'MLSTM'], default='BIDAF', 57 | help='choose the algorithm to use') 58 | model_settings.add_argument('--load_epoch', default=1) 59 | model_settings.add_argument('--embed_size', type=int, default=300, 60 | help='size of the embeddings') 61 | model_settings.add_argument('--hidden_size', type=int, default=150, 62 | help='size of LSTM hidden units') 63 | model_settings.add_argument('--max_p_len', type=int, default=500, 64 | help='max length of passage') 65 | model_settings.add_argument('--max_q_len', type=int, default=30, 66 | help='max length of question') 67 | model_settings.add_argument('--max_a_len', type=int, default=10, 68 | help='max length of answer') 69 | 70 | path_settings = parser.add_argument_group('path settings') 71 | path_settings.add_argument('--train_files', nargs='+', 72 | default=['../data/trainset/train_pre.json'], 73 | help='list of files that contain the preprocessed train data') 74 | path_settings.add_argument('--dev_files', nargs='+', 75 | default=['../data/devset/dev_pre.json'], 76 | help='list of files that contain the preprocessed dev data') 77 | path_settings.add_argument('--test_files', nargs='+', 78 | default=['../data/v84/testset/test_pre.json'], 79 | help='list of files that contain the preprocessed test data') 80 | path_settings.add_argument('--vocab_dir', default='../data/v84/vocab/', 81 | help='the dir to save vocabulary') 82 | path_settings.add_argument('--model_dir', default='../data/v84/models/', 83 | help='the dir to store models') 84 | path_settings.add_argument('--result_dir', default='../data/v84/results/', 85 | help='the dir to output the results') 86 | path_settings.add_argument('--summary_dir', default='../data/v84/summary/', 87 | help='the dir to write tensorboard summary') 88 | path_settings.add_argument('--log_path', default='../data/v84/logging2', 89 | help='path of the log file. 
If not set, logs are printed to console') 90 | 91 | return parser.parse_args() 92 | 93 | 94 | def prepare(args): 95 | """ 96 | 检查数据,创建目录,准备词汇表和词嵌入 97 | checks data, creates the directories, prepare the vocabulary and embeddings 98 | """ 99 | logger = logging.getLogger("brc") 100 | logger.info('checking data file...') 101 | for data_path in args.train_files + args.dev_files + args.test_files: 102 | assert os.path.exists(data_path), '{} is not exits.'.format(data_path) 103 | logger.info('establish folder...') 104 | for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]: 105 | if not os.path.exists(dir_path): 106 | os.makedirs(dir_path) 107 | 108 | logger.info('establish vocab...') 109 | brc_data = BRCDataset(args.max_p_len, args.max_q_len, 110 | args.train_files, args.dev_files, args.test_files) 111 | vocab = Vocab(lower=True) 112 | print(vocab.size()) 113 | for word in brc_data.word_iter('train'): 114 | vocab.add(word) 115 | 116 | unfiltered_vocab_size = vocab.size() 117 | vocab.filter_tokens_by_cnt(min_cnt=2) 118 | 119 | print(vocab.size()) 120 | filtered_num = unfiltered_vocab_size - vocab.size() 121 | logger.info('filte {} words, final num is {}'.format(filtered_num, 122 | vocab.size())) 123 | logger.info('use w2v...') 124 | # vocab.randomly_init_embeddings(args.embed_size) 125 | vocab.load_pretrained_embeddings('../data/w2v/word2vec.model') 126 | 127 | print(vocab.size()) 128 | logger.info('save word table...') 129 | with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout: 130 | pickle.dump(vocab, fout) 131 | 132 | logger.info('finish prepro!') 133 | 134 | 135 | def train(args): 136 | """ 137 | 训练阅读理解模型 138 | """ 139 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 140 | logger = logging.getLogger("brc") 141 | 142 | file_handler = logging.FileHandler(args.log_path) 143 | file_handler.setLevel(logging.INFO) 144 | file_handler.setFormatter(formatter) 145 | logger.addHandler(file_handler) 146 | 147 | console_handler = logging.StreamHandler() 148 | console_handler.setLevel(logging.INFO) 149 | console_handler.setFormatter(formatter) 150 | logger.addHandler(console_handler) 151 | 152 | logger.info(args) 153 | 154 | logger.info('loading datasets and vocab.data...') 155 | with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: 156 | vocab = pickle.load(fin) 157 | brc_data = BRCDataset(args.max_p_len, args.max_q_len, 158 | args.train_files, args.dev_files) 159 | logger.info('changing to id...') 160 | brc_data.convert_to_ids(vocab) 161 | logger.info('init model...') 162 | rc_model = RCModel(vocab, args) 163 | logger.info('training model...') 164 | rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, 165 | save_prefix=args.algo, 166 | dropout_keep_prob=args.dropout_keep_prob) 167 | logger.info('finish training!') 168 | 169 | 170 | def evaluate(args): 171 | """ 172 | 对训练好的模型进行验证 173 | """ 174 | logger = logging.getLogger("brc") 175 | logger.info('loading datasets and vocab.data...') 176 | with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: 177 | vocab = pickle.load(fin) 178 | assert len(args.dev_files) > 0, 'can not find valid file.' 
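    # checkpoints are saved/restored with prefix '{algo}_{epoch}', e.g. BIDAF_4 for --epochs 4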
179 | brc_data = BRCDataset(args.max_p_len, args.max_q_len, dev_files=args.dev_files) 180 | logger.info('change txt to id list') 181 | brc_data.convert_to_ids(vocab) 182 | logger.info('reloading model...') 183 | rc_model = RCModel(vocab, args) 184 | rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo + '_{}'.format(args.epochs)) 185 | logger.info('valid model...') 186 | dev_batches = brc_data.gen_mini_batches('dev', args.batch_size, 187 | pad_id=vocab.get_id(vocab.pad_token), shuffle=False) 188 | dev_loss = rc_model.evaluate( 189 | dev_batches, result_dir=args.result_dir, result_prefix='valid_v84') 190 | logger.info('dev loss is: {}'.format(dev_loss)) 191 | logger.info('save predict ans to {}'.format(os.path.join(args.result_dir))) 192 | 193 | 194 | def predict(args): 195 | """ 196 | 预测测试文件的答案 197 | """ 198 | logger = logging.getLogger("brc") 199 | logger.info('loading datasets and vocab.data...') 200 | with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: 201 | vocab = pickle.load(fin) 202 | assert len(args.test_files) > 0, 'can not find test file.' 203 | brc_data = BRCDataset(args.max_p_len, args.max_q_len, 204 | test_files=args.test_files) 205 | logger.info('change txt to id...') 206 | brc_data.convert_to_ids(vocab) 207 | logger.info('reloading model...') 208 | rc_model = RCModel(vocab, args) 209 | rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo + '_{}'.format(args.epochs)) 210 | logger.info('predict ans...') 211 | test_batches = brc_data.gen_mini_batches('test', args.batch_size, 212 | pad_id=vocab.get_id(vocab.pad_token), shuffle=False) 213 | rc_model.evaluate(test_batches, 214 | result_dir=args.result_dir, result_prefix='test_v84') 215 | 216 | 217 | def run(): 218 | """ 219 | 预训练并运行整个系统. 220 | """ 221 | args = parse_args() 222 | 223 | if args.mode == "dev": 224 | prepro(args.input,1,'../data/devset/dev_pre.json') 225 | args.evaluate = True 226 | elif args.mode == "test": 227 | prepro(args.input, 2, '../data/v84/testset/test_pre.json') 228 | args.predict = True 229 | elif args.mode == "prepro": 230 | args.prepare = True 231 | prepro(args.input, 3, '../data/v84/testset/test_pre.json') 232 | elif args.mode == "train": 233 | prepro(args.input, 1, '../data/trainset/train_pre.json') 234 | args.train = True 235 | 236 | logger = logging.getLogger("brc") 237 | logger.setLevel(logging.INFO) 238 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 239 | if args.log_path: 240 | file_handler = logging.FileHandler(args.log_path) 241 | file_handler.setLevel(logging.INFO) 242 | file_handler.setFormatter(formatter) 243 | logger.addHandler(file_handler) 244 | else: 245 | console_handler = logging.StreamHandler() 246 | console_handler.setLevel(logging.INFO) 247 | console_handler.setFormatter(formatter) 248 | logger.addHandler(console_handler) 249 | 250 | logger.info('Running with args : {}'.format(args)) 251 | 252 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 253 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 254 | 255 | if args.prepare: 256 | prepare(args) 257 | if args.train: 258 | train(args) 259 | if args.evaluate: 260 | evaluate(args) 261 | if args.predict: 262 | predict(args) 263 | 264 | if __name__ == '__main__': 265 | run() 266 | -------------------------------------------------------------------------------- /capsuleNet/vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | """ 3 | This module implements the Vocab class for converting string to 
id and back 4 | """ 5 | 6 | import numpy as np 7 | from gensim.models import Word2Vec 8 | 9 | 10 | class Vocab(object): 11 | """ 12 | 通过tokens一致的嵌入,实现一个词汇表来存储数据里的tokens(词语) 13 | """ 14 | 15 | def __init__(self, filename=None, initial_tokens=None, lower=False): 16 | self.id2token = {} 17 | self.token2id = {} 18 | self.token_cnt = {} 19 | self.lower = lower 20 | 21 | self.embed_dim = None 22 | self.embeddings = None 23 | 24 | self.pad_token = '' 25 | self.unk_token = '' 26 | 27 | self.initial_tokens = initial_tokens if initial_tokens is not None else [] 28 | self.initial_tokens.extend([self.pad_token, self.unk_token]) 29 | for token in self.initial_tokens: 30 | self.add(token) 31 | 32 | if filename is not None: 33 | self.load_from_file(filename) 34 | 35 | def size(self): 36 | """ 37 | 获取词汇表的大小 38 | """ 39 | return len(self.id2token) 40 | 41 | def load_from_file(self, file_path): 42 | """ 43 | 从文件路径加载词汇表 44 | """ 45 | for line in open(file_path, 'r', encoding='utf-8'): 46 | token = line.rstrip('\n') 47 | self.add(token) 48 | 49 | def get_id(self, token): 50 | """ 51 | 获得某个词语的id,如果词语不在词汇表中,则返回未知词标识 52 | """ 53 | token = token.lower() if self.lower else token 54 | try: 55 | return self.token2id[token] 56 | except KeyError: 57 | return self.token2id[self.unk_token] 58 | 59 | def get_token(self, idx): 60 | """ 61 | 获取与id相对应的词语,如果id不在词汇表中,返回未知标识 62 | """ 63 | try: 64 | return self.id2token[idx] 65 | except KeyError: 66 | return self.unk_token 67 | 68 | def add(self, token, cnt=1): 69 | """ 70 | 把词语加入到词汇表 71 | """ 72 | token = token.lower() if self.lower else token 73 | if token in self.token2id: 74 | idx = self.token2id[token] 75 | else: 76 | idx = len(self.id2token) 77 | self.id2token[idx] = token 78 | self.token2id[token] = idx 79 | if cnt > 0: 80 | if token in self.token_cnt: 81 | self.token_cnt[token] += cnt 82 | else: 83 | self.token_cnt[token] = cnt 84 | return idx 85 | 86 | def filter_tokens_by_cnt(self, min_cnt): 87 | """ 88 | 过滤掉一些低频词 89 | """ 90 | filtered_tokens = [token for token in self.token2id if self.token_cnt[token] >= min_cnt] 91 | # rebuild the token x id map 92 | self.token2id = {} 93 | self.id2token = {} 94 | for token in self.initial_tokens: 95 | self.add(token, cnt=0) 96 | for token in filtered_tokens: 97 | self.add(token, cnt=0) 98 | 99 | def randomly_init_embeddings(self, embed_dim): 100 | """ 101 | 随机初始化词向量 102 | """ 103 | self.embed_dim = embed_dim 104 | self.embeddings = np.random.rand(self.size(), embed_dim) 105 | for token in [self.pad_token, self.unk_token]: 106 | self.embeddings[self.get_id(token)] = np.zeros([self.embed_dim]) 107 | 108 | def load_pretrained_embeddings(self, embedding_path): 109 | """ 110 | 根据文件路径,加载预训练的词向量 111 | 不在该词向量集中的词语将被过滤 112 | Args: 113 | embedding_path: the path of the pretrained embedding file 114 | """ 115 | print('load embedding path {}'.format(embedding_path)) 116 | model = Word2Vec.load(embedding_path) 117 | trained_embeddings = {} 118 | for token in model.wv.vocab: 119 | if token not in self.token2id: 120 | continue 121 | trained_embeddings[token] = model.wv[token] 122 | if self.embed_dim is None: 123 | self.embed_dim = len(model.wv[token]) 124 | 125 | filtered_tokens = trained_embeddings.keys() 126 | # 重构词语和id的映射关系 127 | self.token2id = {} 128 | self.id2token = {} 129 | for token in self.initial_tokens: 130 | self.add(token, cnt=0) 131 | for token in filtered_tokens: 132 | self.add(token, cnt=0) 133 | # 加载词嵌入 134 | self.embeddings = np.zeros([self.size(), self.embed_dim]) 135 | count = 0 136 | for token in self.token2id.keys(): 137 | 
if token in trained_embeddings: 138 | self.embeddings[self.get_id(token)] = trained_embeddings[token] 139 | count +=1 140 | print('init vec num{}'.format(count)) 141 | 142 | def load_pretrained_embeddings_txt(self, embedding_path): 143 | """ 144 | 根据文件路径,加载预训练的词向量 145 | 不在该词向量集中的词语将被过滤 146 | Args: 147 | embedding_path: the path of the pretrained embedding file 148 | """ 149 | word_dim = 300 150 | embedding_table = [] 151 | word2id = {} 152 | cnt = -1 153 | fr = open(embedding_path, 'r', encoding='utf8', errors='ignore') 154 | 155 | for line in fr: 156 | row = line.strip().split(' ') 157 | if len(row[1:]) == word_dim: 158 | cnt += 1 159 | word2id[row[0]] = cnt 160 | # vocab.append(row[0]) 161 | embedding_table.append(np.array(row[1:], dtype=np.float32).tolist()) 162 | if cnt % 10000 == 0: 163 | print("loading w2v :current is {0}".format(cnt)) 164 | print("vocab_size is {0}, embed_table size is {1}".format(len(word2id), len(embedding_table))) 165 | print("loaded word2vec") 166 | fr.close() 167 | 168 | print('load embedding path {}'.format(embedding_path)) 169 | trained_embeddings = {} 170 | for token in word2id.keys(): 171 | if token not in self.token2id: 172 | continue 173 | trained_embeddings[token] = embedding_table[word2id[token]] 174 | if self.embed_dim is None: 175 | self.embed_dim = word_dim 176 | 177 | filtered_tokens = trained_embeddings.keys() 178 | # 重构词语和id的映射关系 179 | self.token2id = {} 180 | self.id2token = {} 181 | for token in self.initial_tokens: 182 | self.add(token, cnt=0) 183 | for token in filtered_tokens: 184 | self.add(token, cnt=0) 185 | # 加载词嵌入 186 | self.embeddings = np.zeros([self.size(), self.embed_dim]) 187 | count = 0 188 | for token in self.token2id.keys(): 189 | if token in trained_embeddings: 190 | self.embeddings[self.get_id(token)] = trained_embeddings[token] 191 | count +=1 192 | print('word vec init num{}'.format(count)) 193 | 194 | def convert_to_ids(self, tokens): 195 | """ 196 | 将一组词语转化为id序列 197 | """ 198 | vec = [self.get_id(label) for label in tokens] 199 | return vec 200 | 201 | def recover_from_ids(self, ids, stop_id=None): 202 | """ 203 | 将一组id序列转化为一组词语 204 | """ 205 | tokens = [] 206 | for i in ids: 207 | tokens += [self.get_token(i)] 208 | if stop_id is not None and i == stop_id: 209 | break 210 | return tokens 211 | -------------------------------------------------------------------------------- /data/devset/dev_mini.json: -------------------------------------------------------------------------------- 1 | {"url": "http://iask.sina.com.cn/key/5a18d46b84aedabb5c07a131.html", "alternatives": "有|没有|无法确定", "passage": "动漫好看的H:爱的魔法,KEY的作品,喧嚣学院,草莓100%,双恋,爱丽丝学园,灼眼的夏娜,我的女神,赐予护女神的祝福,旋风管家,全金属狂潮,初音岛,命运之夜,心跳回忆。", "query_id": 250001, "answer": "有", "query": "有没有好看的h"} 2 | {"url": "http://www.120ask.com/question/65970789.htm", "alternatives": "能|不能|无法确定", "passage": "醋泡鸡蛋确实具有一定美白嫩化肌肤、提高皮肤亮度、祛斑的效果,因为白醋中含有的醋酸可以加速表皮新陈代谢、软化角质,鸡蛋清中的蛋白质可以嫩化肌肤,收缩毛孔的作用。", "query_id": 250002, "answer": "能", "query": "醋泡鸡蛋真能去斑吗"} 3 | {"url": "http://wenwen.sogou.com/z/q166740184.htm", "alternatives": "听不懂|听得懂|无法确定", "passage": "人有人言,兽有兽语。动物是不会听懂人说话的", "query_id": 250003, "answer": "听不懂", "query": "老鼠听得懂人话吗"} 4 | {"url": "http://wenwen.sogou.com/z/q705319471.htm", "alternatives": "无法确定|大|不大", "passage": 
"1.前期投资约5-10万元设备投资:柜台、门面装修、电脑及简单家具,一次性投入约2万元。2.3个月运转费用:一家店新开张,要作好两三个月没有生意的准备,最好事先筹备好3个月的运转费用3万元左右。3.进货款:新店开张,店里要备好大约价值2万元的汽车装潢材料。当然,如果有供应商愿意让你代销装潢材料,卖出去再结算,那这一笔费用可以省下。4.手续费:一般来说,注册资金为50万元的企业,代理费用约三四千元。B.每月支出1.房租:在较高档的居民小区附近,租一个20-40平方米的门面,加上水电和物业管理费,一般花费在每月2000-5000元。2.员工工资:开一家汽车装潢小店,至少要聘请一名电工和两名贴膜工。电工月薪在1200-1500元左右,贴膜工大约月薪千元。加上给员工加缴“三金”,每月工资支出约4000元。3.税收:每月固定税收大约500元。4.每月交际费用:不算很高,大约1000元就可以了。", "query_id": 250004, "answer": "无法确定", "query": "开洗车店投资大吗"} 5 | {"url": "http://www.169kang.com/question/166710467.html", "alternatives": "会|不会|无法确定", "passage": "性接触没有保护措施,是有感染的几率的,艾滋病没有特异性的症状。", "query_id": 250005, "answer": "会", "query": "类似性行为会不会感染艾滋病"} 6 | {"url": "http://www.120ask.com/question/36999066.htm", "alternatives": "不能|能|无法确定", "passage": "最起码再来月经后在考虑上环,一般在月经干净后3天左右去上环,这时候是最佳的时间。现在还没有来月经,与生育有关系,所以不用担心的。再说月经周期与心情、压力、饮食、内分泌等也有关系,注意保持心情舒畅,不要有大的心理压力。", "query_id": 250006, "answer": "不能", "query": "产后没来月经能上环么"} 7 | {"url": "http://baike.baidu.com/item/%E6%83%A0%E5%B7%9E%E5%8D%97%E7%AB%99/9326466?fr=aladdin", "alternatives": "有|没有|无法确定", "passage": "惠州南站 惠州南站,位于惠州市惠阳区,是厦深铁路沿线大站之一。隶属广州铁路(集团)公司管辖,现为一等站。 位 置 惠州市惠阳区淡水新桥 序号 车次 等级 始发站   终到站 出发时间   到达时间 到时 发时 车站 到达查询站历时 1 G6343 高速 潮汕   广州南 06:09   08:45 07:37 07:38 惠州南 1小时28分   当日到达", "query_id": 250007, "answer": "有", "query": "惠州淡水有高铁站吗"} 8 | {"url": "https://zhidao.baidu.com/question/246119134944261724", "alternatives": "有|没有|无法确定", "passage": "陈立农有弟弟吗 一个妹妹一个弟弟 当然是亲生的 妹妹比弟弟大", "query_id": 250008, "answer": "有", "query": "陈立农有兄弟姐妹吗"} 9 | {"url": "http://m.iask.sina.com.cn/b/2260918.html", "alternatives": "是|否|无法确定", "passage": "飞机在起飞和降落时是最危险的。飞中远距离的飞机所携带的燃油比较多,如果不放掉大部分燃油,着陆时对起落架等部位的冲击力太大,容易发生意外,同时燃油较少,发生事故时,火灾也不会太严。", "query_id": 250009, "answer": "是", "query": "飞机降落前是否放油"} 10 | {"url": "http://club.xywy.com/static/20150427/65656405.htm", "alternatives": "不行|行|无法确定", "passage": "建议入院行X片检查,必要时行踝关节CT检查排除骨折。如无骨折,则考虑韧带损伤,建议受伤关节制动(不活动)、不负重行走,或保护性活动关节,休息;急性期(伤后48小时)内可冷敷消肿,48小时后可热敷促进血液循环以利消肿,局部可外敷消肿止痛药。必须经过冷敷", "query_id": 250010, "answer": "不行", "query": "脚扭伤不冷敷行吗"} -------------------------------------------------------------------------------- /data/v81/logging2: -------------------------------------------------------------------------------- 1 | 2018-11-20 23:28:26,188 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=4, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=6, evaluate=False, gpu='3', hidden_size=150, input='../data/v81/testset/test_mini.json', learning_rate=0.0005, load_epoch=1, log_path='../data/v81/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='prepro', model_dir='../data/v81/models/', optim='adam', predict=False, prepare=True, result_dir='../data/v81/results/', summary_dir='../data/v81/summary/', test_files=['../data/v81/testset/test_pre.json'], train=False, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v81/vocab/', weight_decay=0) 2 | 2018-11-20 23:28:26,188 - brc - INFO - checking data file... 3 | 2018-11-20 23:28:26,188 - brc - INFO - establish folder... 4 | 2018-11-20 23:28:26,188 - brc - INFO - establish vocab... 5 | 2018-11-20 23:28:26,191 - brc - INFO - train 119 ques. 6 | 2018-11-20 23:28:26,191 - brc - INFO - dev 10 ques 7 | 2018-11-20 23:28:26,191 - brc - INFO - test 10 ques 8 | 2018-11-20 23:28:26,198 - brc - INFO - filte 1595 words, final num is 899 9 | 2018-11-20 23:28:26,198 - brc - INFO - use w2v... 
10 | 2018-11-20 23:28:46,716 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=4, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=6, evaluate=False, gpu='3', hidden_size=150, input='../data/v81/testset/test_mini.json', learning_rate=0.0005, load_epoch=1, log_path='../data/v81/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='prepro', model_dir='../data/v81/models/', optim='adam', predict=False, prepare=True, result_dir='../data/v81/results/', summary_dir='../data/v81/summary/', test_files=['../data/v81/testset/test_pre.json'], train=False, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v81/vocab/', weight_decay=0) 11 | 2018-11-20 23:28:46,716 - brc - INFO - checking data file... 12 | 2018-11-20 23:28:46,716 - brc - INFO - establish folder... 13 | 2018-11-20 23:28:46,716 - brc - INFO - establish vocab... 14 | 2018-11-20 23:28:46,719 - brc - INFO - train 119 ques. 15 | 2018-11-20 23:28:46,720 - brc - INFO - dev 10 ques 16 | 2018-11-20 23:28:46,720 - brc - INFO - test 10 ques 17 | 2018-11-20 23:28:46,727 - brc - INFO - filte 1595 words, final num is 899 18 | 2018-11-20 23:28:46,727 - brc - INFO - use w2v... 19 | 2018-11-20 23:28:47,277 - brc - INFO - save word table... 20 | 2018-11-20 23:28:47,281 - brc - INFO - finish prepro! 21 | 2018-11-20 23:29:51,975 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=4, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=6, evaluate=False, gpu='3', hidden_size=150, input='../data/trainset/train_mini.json', learning_rate=0.0005, load_epoch=1, log_path='../data/v81/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='train', model_dir='../data/v81/models/', optim='adam', predict=False, prepare=False, result_dir='../data/v81/results/', summary_dir='../data/v81/summary/', test_files=['../data/v81/testset/test_pre.json'], train=True, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v81/vocab/', weight_decay=0) 22 | 2018-11-20 23:29:51,976 - brc - INFO - Namespace(algo='BIDAF', batch_size=4, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=6, evaluate=False, gpu='3', hidden_size=150, input='../data/trainset/train_mini.json', learning_rate=0.0005, load_epoch=1, log_path='../data/v81/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='train', model_dir='../data/v81/models/', optim='adam', predict=False, prepare=False, result_dir='../data/v81/results/', summary_dir='../data/v81/summary/', test_files=['../data/v81/testset/test_pre.json'], train=True, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v81/vocab/', weight_decay=0) 23 | 2018-11-20 23:29:51,976 - brc - INFO - Namespace(algo='BIDAF', batch_size=4, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=6, evaluate=False, gpu='3', hidden_size=150, input='../data/trainset/train_mini.json', learning_rate=0.0005, load_epoch=1, log_path='../data/v81/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='train', model_dir='../data/v81/models/', optim='adam', predict=False, prepare=False, result_dir='../data/v81/results/', summary_dir='../data/v81/summary/', test_files=['../data/v81/testset/test_pre.json'], train=True, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v81/vocab/', weight_decay=0) 24 | 2018-11-20 23:29:51,976 - brc - INFO - loading datasets and vocab.data... 
25 | 2018-11-20 23:29:51,976 - brc - INFO - loading datasets and vocab.data... 26 | 2018-11-20 23:29:51,981 - brc - INFO - train 119 ques. 27 | 2018-11-20 23:29:51,981 - brc - INFO - train 119 ques. 28 | 2018-11-20 23:29:51,982 - brc - INFO - dev 10 ques 29 | 2018-11-20 23:29:51,982 - brc - INFO - dev 10 ques 30 | 2018-11-20 23:29:51,982 - brc - INFO - changing to id... 31 | 2018-11-20 23:29:51,982 - brc - INFO - changing to id... 32 | 2018-11-20 23:29:51,986 - brc - INFO - init model... 33 | 2018-11-20 23:29:51,986 - brc - INFO - init model... 34 | 2018-11-20 23:29:58,121 - brc - INFO - building graph used: 6.1334497928619385 secs 35 | 2018-11-20 23:29:58,121 - brc - INFO - building graph used: 6.1334497928619385 secs 36 | 2018-11-20 23:29:59,100 - brc - INFO - training model... 37 | 2018-11-20 23:29:59,100 - brc - INFO - training model... 38 | 2018-11-20 23:29:59,101 - brc - INFO - The 1 time to train model 39 | 2018-11-20 23:29:59,101 - brc - INFO - The 1 time to train model 40 | 2018-11-20 23:30:36,768 - brc - INFO - This batch average train loss 1 is 1.0019565925878637 41 | 2018-11-20 23:30:36,768 - brc - INFO - This batch average train loss 1 is 1.0019565925878637 42 | 2018-11-20 23:30:38,496 - brc - INFO - dev acc 0.6 43 | 2018-11-20 23:30:38,496 - brc - INFO - dev acc 0.6 44 | 2018-11-20 23:30:38,496 - brc - INFO - dev average loss is 0.9330900430679321 45 | 2018-11-20 23:30:38,496 - brc - INFO - dev average loss is 0.9330900430679321 46 | 2018-11-20 23:30:39,873 - brc - INFO - save model to ../data/v81/models/, prefix is BIDAF_1. 47 | 2018-11-20 23:30:39,873 - brc - INFO - save model to ../data/v81/models/, prefix is BIDAF_1. 48 | 2018-11-20 23:30:39,874 - brc - INFO - The 2 time to train model 49 | 2018-11-20 23:30:39,874 - brc - INFO - The 2 time to train model 50 | 2018-11-20 23:31:13,823 - brc - INFO - This batch average train loss 2 is 0.8855785507113993 51 | 2018-11-20 23:31:13,823 - brc - INFO - This batch average train loss 2 is 0.8855785507113993 52 | 2018-11-20 23:31:14,837 - brc - INFO - dev acc 0.5 53 | 2018-11-20 23:31:14,837 - brc - INFO - dev acc 0.5 54 | 2018-11-20 23:31:14,837 - brc - INFO - dev average loss is 0.9231869220733643 55 | 2018-11-20 23:31:14,837 - brc - INFO - dev average loss is 0.9231869220733643 56 | 2018-11-20 23:31:16,009 - brc - INFO - save model to ../data/v81/models/, prefix is BIDAF_2. 57 | 2018-11-20 23:31:16,009 - brc - INFO - save model to ../data/v81/models/, prefix is BIDAF_2. 58 | 2018-11-20 23:31:16,010 - brc - INFO - The 3 time to train model 59 | 2018-11-20 23:31:16,010 - brc - INFO - The 3 time to train model 60 | 2018-11-20 23:31:50,170 - brc - INFO - This batch average train loss 3 is 0.709995542003327 61 | 2018-11-20 23:31:50,170 - brc - INFO - This batch average train loss 3 is 0.709995542003327 62 | 2018-11-20 23:31:51,016 - brc - INFO - dev acc 0.5 63 | 2018-11-20 23:31:51,016 - brc - INFO - dev acc 0.5 64 | 2018-11-20 23:31:51,016 - brc - INFO - dev average loss is 0.8982053995132446 65 | 2018-11-20 23:31:51,016 - brc - INFO - dev average loss is 0.8982053995132446 66 | 2018-11-20 23:31:52,128 - brc - INFO - save model to ../data/v81/models/, prefix is BIDAF_3. 67 | 2018-11-20 23:31:52,128 - brc - INFO - save model to ../data/v81/models/, prefix is BIDAF_3. 
68 | 2018-11-20 23:31:52,128 - brc - INFO - The 4 time to train model 69 | 2018-11-20 23:31:52,128 - brc - INFO - The 4 time to train model 70 | 2018-11-20 23:32:30,965 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=4, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=3, evaluate=True, gpu='3', hidden_size=150, input='../data/devset/dev_mini.json', learning_rate=0.0005, load_epoch=1, log_path='../data/v81/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='valid', model_dir='../data/v81/models/', optim='adam', predict=False, prepare=False, result_dir='../data/v81/results/', summary_dir='../data/v81/summary/', test_files=['../data/v81/testset/test_pre.json'], train=False, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v81/vocab/', weight_decay=0) 71 | 2018-11-20 23:32:30,965 - brc - INFO - loading datasets and vocab.data... 72 | 2018-11-20 23:32:30,968 - brc - INFO - dev 10 ques 73 | 2018-11-20 23:32:30,968 - brc - INFO - change txt to id list 74 | 2018-11-20 23:32:30,968 - brc - INFO - reloading model... 75 | 2018-11-20 23:32:37,216 - brc - INFO - building graph used: 6.246552228927612 secs 76 | 2018-11-20 23:32:38,491 - brc - INFO - reload ../data/v81/models/ model, prefix BIDAF_3 77 | 2018-11-20 23:32:38,491 - brc - INFO - valid model... 78 | 2018-11-20 23:32:40,409 - brc - INFO - save valid_v81 ans to ../data/v81/results/valid_v81.txt 79 | 2018-11-20 23:32:40,411 - brc - INFO - dev loss is: (0.8982053995132446,) 80 | 2018-11-20 23:32:40,411 - brc - INFO - save predict ans to ../data/v81/results/ 81 | 2018-11-20 23:33:13,071 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=4, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=3, evaluate=False, gpu='3', hidden_size=150, input='../data/devset/dev_mini.json', learning_rate=0.0005, load_epoch=1, log_path='../data/v81/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='test', model_dir='../data/v81/models/', optim='adam', predict=True, prepare=False, result_dir='../data/v81/results/', summary_dir='../data/v81/summary/', test_files=['../data/v81/testset/test_pre.json'], train=False, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v81/vocab/', weight_decay=0) 82 | 2018-11-20 23:33:13,071 - brc - INFO - loading datasets and vocab.data... 83 | 2018-11-20 23:33:13,074 - brc - INFO - test 10 ques 84 | 2018-11-20 23:33:13,074 - brc - INFO - change txt to id... 85 | 2018-11-20 23:33:13,075 - brc - INFO - reloading model... 86 | 2018-11-20 23:33:19,080 - brc - INFO - building graph used: 6.00429105758667 secs 87 | 2018-11-20 23:33:20,293 - brc - INFO - reload ../data/v81/models/ model, prefix BIDAF_3 88 | 2018-11-20 23:33:20,293 - brc - INFO - predict ans... 
89 | 2018-11-20 23:33:21,939 - brc - INFO - save test_v81 ans to ../data/v81/results/test_v81.txt 90 | -------------------------------------------------------------------------------- /data/v81/testset/test_mini.json: -------------------------------------------------------------------------------- 1 | {"url": "http://gsrb.gansudaily.com.cn/system/2009/08/23/011235562.shtml", "query": "武威的公交卡古浪能不能用", "query_id": 280001, "alternatives": "能|不能|无法确定", "passage": "武威公交一体化纪实 10家运输公司中标经营包括凉州区、古浪、民勤、天祝在内的城乡公交线路。经过收编、整合、更新,开通城乡公交客运班线23条,统一投放80辆高档次客运车辆,由运输公司统一管理。实际上,运营在这些线路的新型双开门公交车的标准、设施已远远超过城区公交车。武威运管部门通过市场竞争和行业引导,建立退出机制,规范经营行为,提升服务质量。   去年11月下旬,武威市区至古浪县城和凉州区50公里范围内的乡镇全部开通城乡公交,凉州区28个乡镇300个行政村更是全部覆盖城乡公交,率先实现“乡乡通公交,村村通客车”。这些城乡公交定时、定班、定点、定线,城乡公交均等化延伸到农民的家门口。“乡村小公交起到了穿针引线、走村串巷的功能。”沈兴国说。"} 2 | {"url": "http://wenwen.sogou.com/z/q701006723.htm", "query": "能买到无线偷拍器吗", "query_id": 280002, "alternatives": "能|不能|无法确定", "passage": "现在这个社会什么买不到,只要你有钱是不是 欢迎光临【深圳平安安防】无线的有线的都有呢,看你喜欢什么样的了,在这里就不多介绍了,也不好介绍有需要的话你可以进去看一看"} 3 | {"url": "http://wenwen.sogou.com/z/q763575352.htm", "query": "中安信业减免还款是真实的吗", "query_id": 280003, "alternatives": "是真的|不是真的|无法确定", "passage": "请问朋友们网上中安信业代款是真的吗? 【百度反诈骗联盟团队】特别提醒:网上发布的所有只凭身份证就可以贷款或者信用卡的信息都是低级骗局,无论公司是否注册备案,都不要相信,骗子先骗你签订传真合同,并按捺手印,然后会一步步骗取你先支付首月利息、履约费、保证金、保险费、担保费、放款费、公证费、征信费、抵押金、开卡费等等,还会以你银行流水不足、查验你的还款能力或者是验资为名,要求你将自己账户上所有的资金打至骗子的账户,如果你不按骗子的要求交纳费用,骗子会以你已经和他们签了合同为名,威胁要起诉你违约,并威胁你赔偿巨额违约金,这实为低级的诈骗手段和典型诈骗!请永远记住,凡是对方以任何理由要求你先支付任何费用的,都是绝对的诈骗,无论在任何情况下,都不要先给其他人汇款、转账,以免被骗!更不要相信骗子的任何威胁,由于对方涉嫌诈骗,所以,和骗子签的合约没有任何法律效力,更不存在违约之说。所以,特此提醒广大网友,不要相信网上各种投资、融资担保公司以及各类小额贷公司发布的此类贷款或者卡信息,特别是北京、上海、广州、深圳等大城市的这类公司基本都是骗子公司!如果被骗,无论金额大小,都请选择报警!如此猖狂诈骗,还请各地公安机关大力打击和整顿! "} 4 | {"url": "http://www.mama.cn/ask/q13547252-p1.html", "query": "petct医保报销吗", "query_id": 280004, "alternatives": "能|不能|无法确定", "passage": "对于这些的话也可以咨询一下你的直属上司或者是领导,他们专业的意见也都是可以的。"} 5 | {"url": "http://www.d1xz.net/astro/Cancer/art117849.aspx", "query": "巨蟹座慢热么", "query_id": 280005, "alternatives": "慢热|不慢热|无法确定", "passage": "在巨蟹座当中,慢热型的性格,更是让她们的爱心与细腻,更好的发挥到极致。"} 6 | {"url": "http://www.169kang.com/question/369685826.html", "query": "菊花茶叶能一起泡吗", "query_id": 280006, "alternatives": "能|不能|无法确定", "passage": "菊花有清热解毒、清肝明目的作用,茶叶尤其绿茶同样具有清热解毒的作用,两者一起泡茶无碍。"} 7 | {"url": "http://www.169kang.com/question/409628430.html", "query": "嗓子疼吃感康行吗", "query_id": 280007, "alternatives": "行|不行|无法确定", "passage": "引起咽喉疼痛不适的原因多是由于扁桃体炎或是急性咽炎所导致,感康片主要用于感冒不适有一定的作用,如发热,头痛,鼻塞。"} 8 | {"url": "http://www.ali213.net/news/html/2014-7/109714.html", "query": "漫威电影美队换人了吗", "query_id": 280008, "alternatives": "换了|没换|无法确定", "passage": "漫威近日宣布,“猎鹰”山姆·威尔森将代替史蒂夫·罗杰斯,成为10月新系列漫画的新任美国队长!"} 9 | {"url": "http://www.abcb.net.cn/ximan-wulumuqijiunianyiwujiaoyu.html", "query": "新疆是九年义务教育还是十二年义务教育", "query_id": 280009, "alternatives": "九年义务教育|十二年义务教育|无法确定", "passage": ".我国现在实行的还是9年义务教育..至少目前没有准确消息说要实行12点义务教育..不实行新疆不属于这次的试点地区不."} 10 | {"url": "http://cq.bendibao.com/traffic/2018412/72718.shtm", "query": "重庆星期六限号吗", "query_id": 280010, "alternatives": "限|不限|无法确定", "passage": "星期六、星期日因法定节假日调休变为工作日的,不实施尾号限行措施。"} -------------------------------------------------------------------------------- /data/v84/logging2: -------------------------------------------------------------------------------- 1 | 2018-11-20 23:37:15,222 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=100, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=6, evaluate=False, gpu='3', hidden_size=150, input='..data/trainset/train_mini.json', learning_rate=0.001, load_epoch=1, 
log_path='../data/v84/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='train', model_dir='../data/v84/models/', optim='adam', predict=False, prepare=True, result_dir='../data/v84/results/', summary_dir='../data/v84/summary/', test_files=['../data/v84/testset/test_pre.json'], train=False, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v84/vocab/', weight_decay=0) 2 | 2018-11-20 23:37:15,222 - brc - INFO - checking data file... 3 | 2018-11-20 23:37:15,222 - brc - INFO - establish folder... 4 | 2018-11-20 23:37:15,222 - brc - INFO - establish vocab... 5 | 2018-11-20 23:37:15,225 - brc - INFO - train 119 ques. 6 | 2018-11-20 23:37:15,226 - brc - INFO - dev 10 ques 7 | 2018-11-20 23:37:15,226 - brc - INFO - test 10 ques 8 | 2018-11-20 23:37:15,233 - brc - INFO - filte 1595 words, final num is 899 9 | 2018-11-20 23:37:15,233 - brc - INFO - use w2v... 10 | 2018-11-20 23:37:15,769 - brc - INFO - save word table... 11 | 2018-11-20 23:37:15,773 - brc - INFO - finish prepro! 12 | 2018-11-20 23:38:30,266 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=100, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=6, evaluate=False, gpu='3', hidden_size=150, input='../data/trainset/train_mini.json', learning_rate=0.001, load_epoch=1, log_path='../data/v84/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='train', model_dir='../data/v84/models/', optim='adam', predict=False, prepare=False, result_dir='../data/v84/results/', summary_dir='../data/v84/summary/', test_files=['../data/v84/testset/test_pre.json'], train=True, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v84/vocab/', weight_decay=0) 13 | 2018-11-20 23:38:30,266 - brc - INFO - Namespace(algo='BIDAF', batch_size=100, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=6, evaluate=False, gpu='3', hidden_size=150, input='../data/trainset/train_mini.json', learning_rate=0.001, load_epoch=1, log_path='../data/v84/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='train', model_dir='../data/v84/models/', optim='adam', predict=False, prepare=False, result_dir='../data/v84/results/', summary_dir='../data/v84/summary/', test_files=['../data/v84/testset/test_pre.json'], train=True, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v84/vocab/', weight_decay=0) 14 | 2018-11-20 23:38:30,266 - brc - INFO - Namespace(algo='BIDAF', batch_size=100, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=6, evaluate=False, gpu='3', hidden_size=150, input='../data/trainset/train_mini.json', learning_rate=0.001, load_epoch=1, log_path='../data/v84/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='train', model_dir='../data/v84/models/', optim='adam', predict=False, prepare=False, result_dir='../data/v84/results/', summary_dir='../data/v84/summary/', test_files=['../data/v84/testset/test_pre.json'], train=True, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v84/vocab/', weight_decay=0) 15 | 2018-11-20 23:38:30,267 - brc - INFO - loading datasets and vocab.data... 16 | 2018-11-20 23:38:30,267 - brc - INFO - loading datasets and vocab.data... 17 | 2018-11-20 23:38:30,272 - brc - INFO - train 119 ques. 18 | 2018-11-20 23:38:30,272 - brc - INFO - train 119 ques. 19 | 2018-11-20 23:38:30,272 - brc - INFO - dev 10 ques 20 | 2018-11-20 23:38:30,272 - brc - INFO - dev 10 ques 21 | 2018-11-20 23:38:30,273 - brc - INFO - changing to id... 
22 | 2018-11-20 23:38:30,273 - brc - INFO - changing to id... 23 | 2018-11-20 23:38:30,277 - brc - INFO - init model... 24 | 2018-11-20 23:38:30,277 - brc - INFO - init model... 25 | 2018-11-20 23:38:36,255 - brc - INFO - building graph used: 5.974062919616699 secs 26 | 2018-11-20 23:38:36,255 - brc - INFO - building graph used: 5.974062919616699 secs 27 | 2018-11-20 23:38:37,220 - brc - INFO - training model... 28 | 2018-11-20 23:38:37,220 - brc - INFO - training model... 29 | 2018-11-20 23:38:37,220 - brc - INFO - The 1 time to train model 30 | 2018-11-20 23:38:37,220 - brc - INFO - The 1 time to train model 31 | 2018-11-20 23:39:25,058 - brc - INFO - This batch average train loss 1 is 1.1325449572891748 32 | 2018-11-20 23:39:25,058 - brc - INFO - This batch average train loss 1 is 1.1325449572891748 33 | 2018-11-20 23:39:26,778 - brc - INFO - dev acc 0.3 34 | 2018-11-20 23:39:26,778 - brc - INFO - dev acc 0.3 35 | 2018-11-20 23:39:26,778 - brc - INFO - dev average loss is 1.2414356470108032 36 | 2018-11-20 23:39:26,778 - brc - INFO - dev average loss is 1.2414356470108032 37 | 2018-11-20 23:39:28,185 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_1. 38 | 2018-11-20 23:39:28,185 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_1. 39 | 2018-11-20 23:39:28,186 - brc - INFO - The 2 time to train model 40 | 2018-11-20 23:39:28,186 - brc - INFO - The 2 time to train model 41 | 2018-11-20 23:39:54,635 - brc - INFO - This batch average train loss 2 is 1.159237619708566 42 | 2018-11-20 23:39:54,635 - brc - INFO - This batch average train loss 2 is 1.159237619708566 43 | 2018-11-20 23:39:55,654 - brc - INFO - dev acc 0.5 44 | 2018-11-20 23:39:55,654 - brc - INFO - dev acc 0.5 45 | 2018-11-20 23:39:55,655 - brc - INFO - dev average loss is 1.086735486984253 46 | 2018-11-20 23:39:55,655 - brc - INFO - dev average loss is 1.086735486984253 47 | 2018-11-20 23:39:57,109 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_2. 48 | 2018-11-20 23:39:57,109 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_2. 49 | 2018-11-20 23:39:57,110 - brc - INFO - The 3 time to train model 50 | 2018-11-20 23:39:57,110 - brc - INFO - The 3 time to train model 51 | 2018-11-20 23:40:42,709 - brc - INFO - This batch average train loss 3 is 0.889101853891581 52 | 2018-11-20 23:40:42,709 - brc - INFO - This batch average train loss 3 is 0.889101853891581 53 | 2018-11-20 23:40:43,719 - brc - INFO - dev acc 0.6 54 | 2018-11-20 23:40:43,719 - brc - INFO - dev acc 0.6 55 | 2018-11-20 23:40:43,719 - brc - INFO - dev average loss is 1.1423981189727783 56 | 2018-11-20 23:40:43,719 - brc - INFO - dev average loss is 1.1423981189727783 57 | 2018-11-20 23:40:45,125 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_3. 58 | 2018-11-20 23:40:45,125 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_3. 
59 | 2018-11-20 23:40:45,125 - brc - INFO - The 4 time to train model 60 | 2018-11-20 23:40:45,125 - brc - INFO - The 4 time to train model 61 | 2018-11-20 23:41:25,324 - brc - INFO - This batch average train loss 4 is 0.814598362986781 62 | 2018-11-20 23:41:25,324 - brc - INFO - This batch average train loss 4 is 0.814598362986781 63 | 2018-11-20 23:41:26,094 - brc - INFO - dev acc 0.5 64 | 2018-11-20 23:41:26,094 - brc - INFO - dev acc 0.5 65 | 2018-11-20 23:41:26,095 - brc - INFO - dev average loss is 0.9756466746330261 66 | 2018-11-20 23:41:26,095 - brc - INFO - dev average loss is 0.9756466746330261 67 | 2018-11-20 23:41:27,356 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_4. 68 | 2018-11-20 23:41:27,356 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_4. 69 | 2018-11-20 23:41:27,356 - brc - INFO - The 5 time to train model 70 | 2018-11-20 23:41:27,356 - brc - INFO - The 5 time to train model 71 | 2018-11-20 23:41:51,584 - brc - INFO - This batch average train loss 5 is 0.7426063959338084 72 | 2018-11-20 23:41:51,584 - brc - INFO - This batch average train loss 5 is 0.7426063959338084 73 | 2018-11-20 23:41:52,496 - brc - INFO - dev acc 0.5 74 | 2018-11-20 23:41:52,496 - brc - INFO - dev acc 0.5 75 | 2018-11-20 23:41:52,496 - brc - INFO - dev average loss is 0.8413645625114441 76 | 2018-11-20 23:41:52,496 - brc - INFO - dev average loss is 0.8413645625114441 77 | 2018-11-20 23:41:53,775 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_5. 78 | 2018-11-20 23:41:53,775 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_5. 79 | 2018-11-20 23:41:53,775 - brc - INFO - The 6 time to train model 80 | 2018-11-20 23:41:53,775 - brc - INFO - The 6 time to train model 81 | 2018-11-20 23:42:31,250 - brc - INFO - This batch average train loss 6 is 0.6743937124725149 82 | 2018-11-20 23:42:31,250 - brc - INFO - This batch average train loss 6 is 0.6743937124725149 83 | 2018-11-20 23:42:32,086 - brc - INFO - dev acc 0.6 84 | 2018-11-20 23:42:32,086 - brc - INFO - dev acc 0.6 85 | 2018-11-20 23:42:32,086 - brc - INFO - dev average loss is 0.928860068321228 86 | 2018-11-20 23:42:32,086 - brc - INFO - dev average loss is 0.928860068321228 87 | 2018-11-20 23:42:33,416 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_6. 88 | 2018-11-20 23:42:33,416 - brc - INFO - save model to ../data/v84/models/, prefix is BIDAF_6. 89 | 2018-11-20 23:42:33,416 - brc - INFO - finish training! 90 | 2018-11-20 23:42:33,416 - brc - INFO - finish training! 91 | 2018-11-20 23:43:12,178 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=10, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=4, evaluate=True, gpu='3', hidden_size=150, input='../data/devset/dev_mini.json', learning_rate=0.001, load_epoch=1, log_path='../data/v84/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='dev', model_dir='../data/v84/models/', optim='adam', predict=False, prepare=False, result_dir='../data/v84/results/', summary_dir='../data/v84/summary/', test_files=['../data/v84/testset/test_pre.json'], train=False, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v84/vocab/', weight_decay=0) 92 | 2018-11-20 23:43:12,178 - brc - INFO - loading datasets and vocab.data... 93 | 2018-11-20 23:43:12,183 - brc - INFO - dev 10 ques 94 | 2018-11-20 23:43:12,183 - brc - INFO - change txt to id list 95 | 2018-11-20 23:43:12,184 - brc - INFO - reloading model... 
96 | 2018-11-20 23:43:18,248 - brc - INFO - building graph used: 6.060601234436035 secs 97 | 2018-11-20 23:43:19,485 - brc - INFO - reload ../data/v84/models/ model, prefix BIDAF_4 98 | 2018-11-20 23:43:19,486 - brc - INFO - valid model... 99 | 2018-11-20 23:43:21,059 - brc - INFO - save valid_v84 ans to ../data/v84/results/valid_v84.txt 100 | 2018-11-20 23:43:21,059 - brc - INFO - dev loss is: (0.9756466746330261,) 101 | 2018-11-20 23:43:21,059 - brc - INFO - save predict ans to ../data/v84/results/ 102 | 2018-11-20 23:44:19,849 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=10, dev_files=['../data/devset/dev_pre.json'], dropout_keep_prob=1, embed_size=300, epochs=4, evaluate=False, gpu='3', hidden_size=150, input='../data/v84/testset/test_mini.json', learning_rate=0.001, load_epoch=1, log_path='../data/v84/logging2', max_a_len=10, max_p_len=500, max_q_len=30, mode='test', model_dir='../data/v84/models/', optim='adam', predict=True, prepare=False, result_dir='../data/v84/results/', summary_dir='../data/v84/summary/', test_files=['../data/v84/testset/test_pre.json'], train=False, train_files=['../data/trainset/train_pre.json'], vocab_dir='../data/v84/vocab/', weight_decay=0) 103 | 2018-11-20 23:44:19,849 - brc - INFO - loading datasets and vocab.data... 104 | 2018-11-20 23:44:19,853 - brc - INFO - test 10 ques 105 | 2018-11-20 23:44:19,853 - brc - INFO - change txt to id... 106 | 2018-11-20 23:44:19,853 - brc - INFO - reloading model... 107 | 2018-11-20 23:44:26,431 - brc - INFO - building graph used: 6.57421612739563 secs 108 | 2018-11-20 23:44:27,842 - brc - INFO - reload ../data/v84/models/ model, prefix BIDAF_4 109 | 2018-11-20 23:44:27,842 - brc - INFO - predict ans... 110 | 2018-11-20 23:44:29,964 - brc - INFO - save test_v84 ans to ../data/v84/results/test_v84.txt 111 | -------------------------------------------------------------------------------- /data/v84/testset/test_mini.json: -------------------------------------------------------------------------------- 1 | {"url": "http://gsrb.gansudaily.com.cn/system/2009/08/23/011235562.shtml", "query": "武威的公交卡古浪能不能用", "query_id": 280001, "alternatives": "能|不能|无法确定", "passage": "武威公交一体化纪实 10家运输公司中标经营包括凉州区、古浪、民勤、天祝在内的城乡公交线路。经过收编、整合、更新,开通城乡公交客运班线23条,统一投放80辆高档次客运车辆,由运输公司统一管理。实际上,运营在这些线路的新型双开门公交车的标准、设施已远远超过城区公交车。武威运管部门通过市场竞争和行业引导,建立退出机制,规范经营行为,提升服务质量。   去年11月下旬,武威市区至古浪县城和凉州区50公里范围内的乡镇全部开通城乡公交,凉州区28个乡镇300个行政村更是全部覆盖城乡公交,率先实现“乡乡通公交,村村通客车”。这些城乡公交定时、定班、定点、定线,城乡公交均等化延伸到农民的家门口。“乡村小公交起到了穿针引线、走村串巷的功能。”沈兴国说。"} 2 | {"url": "http://wenwen.sogou.com/z/q701006723.htm", "query": "能买到无线偷拍器吗", "query_id": 280002, "alternatives": "能|不能|无法确定", "passage": "现在这个社会什么买不到,只要你有钱是不是 欢迎光临【深圳平安安防】无线的有线的都有呢,看你喜欢什么样的了,在这里就不多介绍了,也不好介绍有需要的话你可以进去看一看"} 3 | {"url": "http://wenwen.sogou.com/z/q763575352.htm", "query": "中安信业减免还款是真实的吗", "query_id": 280003, "alternatives": "是真的|不是真的|无法确定", "passage": "请问朋友们网上中安信业代款是真的吗? 【百度反诈骗联盟团队】特别提醒:网上发布的所有只凭身份证就可以贷款或者信用卡的信息都是低级骗局,无论公司是否注册备案,都不要相信,骗子先骗你签订传真合同,并按捺手印,然后会一步步骗取你先支付首月利息、履约费、保证金、保险费、担保费、放款费、公证费、征信费、抵押金、开卡费等等,还会以你银行流水不足、查验你的还款能力或者是验资为名,要求你将自己账户上所有的资金打至骗子的账户,如果你不按骗子的要求交纳费用,骗子会以你已经和他们签了合同为名,威胁要起诉你违约,并威胁你赔偿巨额违约金,这实为低级的诈骗手段和典型诈骗!请永远记住,凡是对方以任何理由要求你先支付任何费用的,都是绝对的诈骗,无论在任何情况下,都不要先给其他人汇款、转账,以免被骗!更不要相信骗子的任何威胁,由于对方涉嫌诈骗,所以,和骗子签的合约没有任何法律效力,更不存在违约之说。所以,特此提醒广大网友,不要相信网上各种投资、融资担保公司以及各类小额贷公司发布的此类贷款或者卡信息,特别是北京、上海、广州、深圳等大城市的这类公司基本都是骗子公司!如果被骗,无论金额大小,都请选择报警!如此猖狂诈骗,还请各地公安机关大力打击和整顿! 
"} 4 | {"url": "http://www.mama.cn/ask/q13547252-p1.html", "query": "petct医保报销吗", "query_id": 280004, "alternatives": "能|不能|无法确定", "passage": "对于这些的话也可以咨询一下你的直属上司或者是领导,他们专业的意见也都是可以的。"} 5 | {"url": "http://www.d1xz.net/astro/Cancer/art117849.aspx", "query": "巨蟹座慢热么", "query_id": 280005, "alternatives": "慢热|不慢热|无法确定", "passage": "在巨蟹座当中,慢热型的性格,更是让她们的爱心与细腻,更好的发挥到极致。"} 6 | {"url": "http://www.169kang.com/question/369685826.html", "query": "菊花茶叶能一起泡吗", "query_id": 280006, "alternatives": "能|不能|无法确定", "passage": "菊花有清热解毒、清肝明目的作用,茶叶尤其绿茶同样具有清热解毒的作用,两者一起泡茶无碍。"} 7 | {"url": "http://www.169kang.com/question/409628430.html", "query": "嗓子疼吃感康行吗", "query_id": 280007, "alternatives": "行|不行|无法确定", "passage": "引起咽喉疼痛不适的原因多是由于扁桃体炎或是急性咽炎所导致,感康片主要用于感冒不适有一定的作用,如发热,头痛,鼻塞。"} 8 | {"url": "http://www.ali213.net/news/html/2014-7/109714.html", "query": "漫威电影美队换人了吗", "query_id": 280008, "alternatives": "换了|没换|无法确定", "passage": "漫威近日宣布,“猎鹰”山姆·威尔森将代替史蒂夫·罗杰斯,成为10月新系列漫画的新任美国队长!"} 9 | {"url": "http://www.abcb.net.cn/ximan-wulumuqijiunianyiwujiaoyu.html", "query": "新疆是九年义务教育还是十二年义务教育", "query_id": 280009, "alternatives": "九年义务教育|十二年义务教育|无法确定", "passage": ".我国现在实行的还是9年义务教育..至少目前没有准确消息说要实行12点义务教育..不实行新疆不属于这次的试点地区不."} 10 | {"url": "http://cq.bendibao.com/traffic/2018412/72718.shtm", "query": "重庆星期六限号吗", "query_id": 280010, "alternatives": "限|不限|无法确定", "passage": "星期六、星期日因法定节假日调休变为工作日的,不实施尾号限行措施。"} -------------------------------------------------------------------------------- /data/w2v/log: -------------------------------------------------------------------------------- 1 | you should put word2vec here 2 | -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd capsule-mrc81/capsuleNet-mrc/ 3 | python3 run81.py --mode 'test' --input '/search/work/input/data' 4 | python3 run84.py --mode 'test' --input '/search/work/input/data' 5 | 6 | cd ../../QA_Test 7 | python3 config_cla_v646.py --mode 'test' --input '/search/work/input/data' 8 | python3 config_cla_v60.py --mode 'test' --input '/search/work/input/data' 9 | python3 config_ans_v20.py --mode 'test' --input '/search/work/input/data' 10 | 11 | cd .. 
12 | python3 vote_ser_new_word.py --mode 'predict' --input 'test' --predict_file '/search/work/output/result' 13 | -------------------------------------------------------------------------------- /vote_ser_new_word.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | import numpy as np 5 | 6 | import jieba 7 | import itertools 8 | 9 | from tqdm import tqdm 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser('ensemble') 14 | parser.add_argument('--ensemble_size', type=int, default=5, help='5') 15 | parser.add_argument('--mode', type=str, default='ensemble', help='ensemble/predict') 16 | parser.add_argument('--input', type=str, default='valid', help='valid/test') 17 | parser.add_argument('--input_file_dev', type=str, default='QANet/datasets/aic18/valid_mini.json', 18 | help='/data1/lcn/project/aichallenger/data/input/valid.json') 19 | parser.add_argument('--input_file_test', type=str, default='QANet/datasets/aic18/test_mini.json', 20 | help='/data1/lcn/project/aichallenger/data/input/test.json') 21 | parser.add_argument('--predict_file', type=str, default='predict_final', 22 | help='list of files that contain the preprocessed dev data') 23 | parser.add_argument('--root_file', type=str, default='/data1/lcn/project/aichallenger/py/ensemble/valid/', help='') 24 | parser.add_argument('--template_dev', type=str, 25 | default='QANet/datasets/aic18/template_dev.json', 26 | help='') 27 | parser.add_argument('--template_test', type=str, 28 | default='QANet/datasets/aic18/template_test.json', 29 | help='') 30 | parser.add_argument('--files', nargs='+', default=['',''], 31 | help='list of files that contain the preprocessed train data') 32 | return parser.parse_args() 33 | 34 | 35 | def get_answer_label(seg_query, ansTokens): 36 | shuffledAnsToken_index = [] 37 | shuffledAnsToken = [] 38 | query = "" 39 | for i in seg_query: 40 | query += i 41 | label = None 42 | ansTokens = [x.strip() for x in ansTokens] 43 | unkownMark = False 44 | unkownIdx = -1 45 | unkownChar = ['无法确定', '无法确认', '不确定', '不能确定', 'wfqd', '无法选择', '无法确实', '无法取代', '取法确定', '无法确', '无法㾡', '无法去顶', '无确定', 46 | '无法去顶', '我放弃', '无法缺定', '无法无额定', '无法判断', '不清楚', '无人确定', "不知道"] 47 | 48 | for idx, token in enumerate(ansTokens): 49 | for ch in unkownChar: 50 | if token.find(ch) != -1: 51 | unkownMark = True 52 | unkownIdx = idx 53 | break 54 | if unkownMark: 55 | break 56 | # print("%s %s %s : %d %s"%(ansTokens[0],ansTokens[1],ansTokens[2],unkownIdx,ansTokens[unkownIdx])) 57 | minFindStart = 999999 58 | minIdx = -1 59 | if unkownMark == False: 60 | pass 61 | # print("%s %s %s unkonwn mark error" % (ansTokens[0], ansTokens[1], ansTokens[2])) 62 | else: 63 | for idx, token in enumerate(ansTokens): 64 | if unkownIdx == idx: 65 | continue 66 | tmpFindStart = query.find(token) 67 | if tmpFindStart == -1: 68 | tmpFindStart = 999999 69 | 70 | if minFindStart > tmpFindStart: 71 | minIdx = idx 72 | minFindStart = tmpFindStart 73 | if not (minIdx < 0 or minIdx > 2 or unkownMark < 0 or unkownMark > 2): 74 | if minIdx == 0: 75 | label = [1, 0, 0] 76 | elif unkownIdx == 0: 77 | label = [0, 0, 1] 78 | else: 79 | label = [0, 1, 0] 80 | else: 81 | minIdx = -999 82 | pessimisticDic = {"不会", "不可以", "不是", "假的", "不要", "不靠谱", "不能", "没有", "不需要", "没出", "不给", "不用", "不可能", "不好", 83 | "不同意", 84 | "不对", "不算", "不行", "不快", "不能", "没用", "不合适", "不正常", "不好", "不可", "不正确", "不高", "不难", "不属于", 85 | "不合适", 86 | "不值钱", "不友好", "不幸运", "不应该", "不值"} 87 | for idx, token in 
enumerate(ansTokens): 88 | if idx == unkownIdx: 89 | continue 90 | for opt in pessimisticDic: 91 | if token.find(opt) != -1: 92 | minIdx = 3 - idx - unkownIdx 93 | if minIdx != -999: 94 | if minIdx == 0: 95 | label = [1, 0, 0] 96 | elif unkownIdx == 0: 97 | label = [0, 0, 1] 98 | else: 99 | label = [0, 1, 0] 100 | else: 101 | minIdx = -999 102 | for idx, token in enumerate(ansTokens): 103 | if token.find("不确定") == -1 and token.find("不能确定") == -1 and ( 104 | token.find("不") != -1 or token.find("否") != -1 or token.find( 105 | "没") != -1 or token.find("错") != -1): 106 | minIdx = 3 - idx - unkownIdx 107 | if minIdx != -999: 108 | if minIdx == 0: 109 | label = [1, 0, 0] 110 | elif unkownIdx == 0: 111 | label = [0, 0, 1] 112 | else: 113 | label = [0, 1, 0] 114 | else: 115 | print("after last process ,still failed") 116 | try: 117 | if label != None: 118 | if minIdx == 0: 119 | if unkownIdx == 1: 120 | shuffledAnsToken_index = [0, 2, 1] 121 | elif unkownIdx == 2: 122 | shuffledAnsToken_index = [0, 1, 2] 123 | elif minIdx == 1: 124 | if unkownIdx == 0: 125 | shuffledAnsToken_index = [1, 2, 0] 126 | elif unkownIdx == 2: 127 | shuffledAnsToken_index = [1, 0, 2] 128 | elif minIdx == 2: 129 | if unkownIdx == 0: 130 | shuffledAnsToken_index = [2, 1, 0] 131 | elif unkownIdx == 1: 132 | shuffledAnsToken_index = [2, 0, 1] 133 | shuffledAnsToken = [ansTokens[i] for i in shuffledAnsToken_index] 134 | except: 135 | shuffledAnsToken_index = [] 136 | 137 | return label, ansTokens, shuffledAnsToken_index, shuffledAnsToken 138 | 139 | 140 | def modify_index_save(input_file, savefile): 141 | # print(input_file,savefile) 142 | outf = open(savefile, 'w', encoding='utf-8') 143 | inf = open(input_file, 'r', encoding='utf-8') 144 | for line in tqdm(inf): 145 | line = json.loads(line) 146 | alternatives = line['alternatives'].split('|') 147 | ques_word = list(jieba.cut(line['query'])) 148 | query_id = line['query_id'] 149 | label, ans, index, shu_ans = get_answer_label(ques_word, alternatives) 150 | if len(shu_ans) == 0: 151 | shu_ans = ans 152 | if label is None: 153 | label = [1, 0, 0] 154 | dict = {'query_id': query_id, 'ans_label': label, 'shu_ans': shu_ans, 'index': index, } 155 | # print(json.dumps(dict, ensure_ascii=False), file=outf) 156 | outf.write(json.dumps(dict, ensure_ascii=False)) 157 | outf.write('\n') 158 | outf.close() 159 | 160 | 161 | def ensemble_5(files, template_file, ensemble_size=3): 162 | threshold = ensemble_size // 2 + 1 163 | 164 | total = 0 165 | total_right = 0 166 | total_right_random = 0 167 | total_wrong = 0 168 | temp_data = {} 169 | template_f = open(template_file, 'r', encoding='utf-8') 170 | template = {} 171 | for line_ in template_f: 172 | line_ = json.loads(line_) 173 | shu_ans = line_['shu_ans'] 174 | true_ans = shu_ans[np.argmax(line_['ans_label'])] 175 | template[line_['query_id']] = {'shu_ans': shu_ans, 'true_ans': true_ans} 176 | 177 | for index, path in enumerate(files): 178 | file = open(path, 'r', encoding='utf-8') 179 | for line in file: 180 | line = json.loads(line) 181 | id = line['query_id'] 182 | 183 | if 'predict' in line.keys(): 184 | if len(line['predict']) == 0: 185 | predict_index = 0 186 | else: 187 | predict_index = np.argmax(line['predict']) 188 | predict_word = template[id]['shu_ans'][predict_index] 189 | else: 190 | predict_word = line['pred_answer'] 191 | 192 | if index == 0 or (index != 0 and id not in temp_data.keys()): 193 | temp_data[id] = {} 194 | temp_data[id]['true_ans'] = template[id]['true_ans'] 195 | 196 | if predict_word not in 
temp_data[id].keys(): 197 | temp_data[id][predict_word] = 1 198 | else: 199 | temp_data[id][predict_word] += 1 200 | 201 | for id, pre in temp_data.items(): 202 | # print('----') 203 | total += 1 204 | label = pre['true_ans'] 205 | for key, value in pre.items(): 206 | if key != 'true_ans': 207 | if value >= threshold: 208 | flag = 0 209 | pre = key 210 | break 211 | else: 212 | flag = 1 213 | pre = 0 214 | if flag == 1: # if not get the rigth result of vote,select the first answer of alternatives(in valid is always first) 215 | total_right_random += 1 216 | elif flag == 0 and pre == label: 217 | total_right += 1 218 | elif flag == 0 and pre != label: 219 | total_wrong += 1 220 | print('{}/{} instances,random {},and the acc is {},total_wrong is {},'.format 221 | (total_right, total, total_right_random, (total_right+total_right_random) / total, total_wrong, )) 222 | return (total_right+total_right_random) / total 223 | 224 | 225 | def ensemble_predict_5(predict_file, template_file, files, ensemble_size=3): 226 | ouf = open(predict_file, 'w', encoding='utf-8') 227 | threshold = ensemble_size // 2 + 1 228 | 229 | total = 0 230 | temp_data = {} 231 | 232 | template_f = open(template_file, 'r', encoding='utf-8') 233 | template = {} 234 | for line_ in template_f: 235 | line_ = json.loads(line_) 236 | shu_ans = line_['shu_ans'] 237 | template[line_['query_id']] = {'shu_ans': shu_ans} 238 | 239 | for index, path in enumerate(files): 240 | file = open(path, 'r', encoding='utf-8') 241 | for line in file: 242 | line = json.loads(line) 243 | id = line['query_id'] 244 | if index == 0 or (index != 0 and id not in temp_data.keys()): 245 | temp_data[id] = {} 246 | temp_data[id]['shu_ans'] = template[id]['shu_ans'] 247 | 248 | if 'predict' in line.keys(): 249 | if len(line['predict']) == 0: 250 | predict_index = 0 251 | else: 252 | predict_index = np.argmax(line['predict']) 253 | try: 254 | shu_ans = template[id]['shu_ans'] 255 | predict_word = shu_ans[predict_index] 256 | except: 257 | predict_word = template[id]['shu_ans'][0] 258 | else: 259 | predict_word = line['pred_answer'] 260 | 261 | if predict_word not in temp_data[id].keys(): 262 | temp_data[id][predict_word] = 1 263 | else: 264 | temp_data[id][predict_word] += 1 265 | for id, pre in temp_data.items(): 266 | total += 1 267 | for key, value in pre.items(): 268 | if key != 'shu_ans': 269 | if value >= threshold: 270 | flag = 0 271 | predict = key 272 | break 273 | else: 274 | flag = 1 275 | 276 | if flag == 1: 277 | predict = pre['shu_ans'][0] 278 | 279 | ouf.write((str(id) + '\t' + predict + '\n')) 280 | ouf.close() 281 | 282 | 283 | if __name__ == '__main__': 284 | args = parse_args() 285 | 286 | files_ = [] 287 | files_test=['data/v81/results/test_v81.txt', 288 | 'data/v84/results/test_v84.txt', 289 | 'QANet/answers/ver20/sorted_testa_v20.txt', 290 | 'QANet/answers/ver60/testa_v60.txt', 291 | 'QANet/answers/ver646/testa_v646.txt'] 292 | files_valid=['data/v81/results/valid_v81.txt', 293 | 'data/v84/results/valid_v84.txt', 294 | 'QANet/answers/ver20/sorted_valid_v20.txt', 295 | 'QANet/answers/ver60/valid_v60.txt', 296 | 'QANet/answers/ver646/valid_v646.txt' 297 | ] 298 | if args.mode == 'ensemble': 299 | modify_index_save(args.input_file_dev, args.template_dev) 300 | ensemble_5(files_valid, args.template_dev, args.ensemble_size) 301 | else: 302 | modify_index_save(args.input_file_test, args.template_test) 303 | ensemble_predict_5(args.predict_file, args.template_test, files_test, args.ensemble_size) 
--------------------------------------------------------------------------------
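Note on the voting step: the ensemble in vote_ser_new_word.py above reduces to a simple majority count per query_id. Each of the five model outputs contributes one predicted alternative; an alternative wins once it collects at least ensemble_size // 2 + 1 votes, and if nothing reaches that threshold the script falls back to the first alternative in the shuffled (positive, negative, unknown) order, i.e. shu_ans[0]. The snippet below is only a minimal, self-contained sketch of that counting logic under those assumptions; the helper name majority_vote and the sample votes are made up for illustration and are not part of the repository.

from collections import Counter

def majority_vote(predictions, fallback, ensemble_size=5):
    # predictions: one predicted alternative per ensemble member.
    # fallback: used when no alternative reaches a strict majority
    #           (the script uses shu_ans[0], the "positive" option).
    threshold = ensemble_size // 2 + 1
    word, num = Counter(predictions).most_common(1)[0]
    return word if num >= threshold else fallback

# Made-up example: five models vote on alternatives 能|不能|无法确定
votes = ["能", "能", "不能", "能", "无法确定"]
print(majority_vote(votes, fallback="能"))  # -> "能" (3 of 5 votes, threshold is 3)

Because the threshold is a strict majority, at most one alternative can reach it, so taking the most common vote is equivalent to the first-key-over-threshold loop used in ensemble_5 and ensemble_predict_5.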