├── .gitignore ├── AAAI2022_demo ├── semantic_parsing.ipynb └── text_classification.ipynb ├── CLIQ-ai2021_demo ├── semantic_parsing.ipynb └── text_classification.ipynb ├── DLG4NLP@ICLR2022_demo ├── Math-word-problem │ ├── config.yaml │ ├── data │ │ └── raw │ │ │ ├── test.txt │ │ │ ├── train.txt │ │ │ └── valid.txt │ ├── imgs │ │ └── g2t.png │ ├── math_word_problem.ipynb │ └── utils.py └── text_classification.ipynb ├── GraphML2022_demo ├── Math-word-problem │ ├── .ipynb_checkpoints │ │ └── math_word_problem-checkpoint.ipynb │ ├── config.yaml │ ├── data │ │ └── raw │ │ │ ├── test.txt │ │ │ ├── train.txt │ │ │ └── valid.txt │ ├── imgs │ │ └── g2t.png │ ├── math_word_problem.ipynb │ └── utils.py └── text_classification.ipynb ├── IJCAI2021_demo ├── kg_completion │ ├── .ipynb_checkpoints │ │ └── kgc-checkpoint.ipynb │ ├── __init__.py │ ├── data │ │ └── kinship │ │ │ ├── e1rel_to_e2_full.json │ │ │ ├── e1rel_to_e2_ranking_dev.json │ │ │ ├── e1rel_to_e2_ranking_test.json │ │ │ ├── e1rel_to_e2_train.json │ │ │ ├── test.txt │ │ │ ├── train.txt │ │ │ └── valid.txt │ ├── evaluation.py │ ├── kgc.ipynb │ ├── kinship │ │ ├── processed │ │ │ └── KG_graph.pt │ │ └── raw │ │ │ └── kinship.tar.gz │ ├── model.py │ ├── preprocess.sh │ ├── spodernet │ │ ├── __init__.py │ │ ├── backends │ │ │ ├── __init__.py │ │ │ ├── tfbackend.py │ │ │ ├── tfmodels.py │ │ │ ├── torchbackend.py │ │ │ └── torchmodels.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ └── snli2spoder.py │ │ ├── frontend.py │ │ ├── hooks.py │ │ ├── interfaces.py │ │ ├── preprocessing │ │ │ ├── __init__.py │ │ │ ├── batching.py │ │ │ ├── pipeline.py │ │ │ ├── processors.py │ │ │ └── vocab.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── cuda_utils.py │ │ │ ├── global_config.py │ │ │ ├── logger.py │ │ │ ├── spacy_util.py │ │ │ └── util.py │ └── wrangle_KG.py ├── math_word_problem_solving │ ├── .ipynb_checkpoints │ │ └── math_word_problem-checkpoint.ipynb │ ├── config.yaml │ ├── data │ │ ├── processed │ │ │ └── DependencyGraph │ │ │ 
│ ├── data.pt │ │ │ │ └── vocab.pt │ │ └── raw │ │ │ ├── test.txt │ │ │ ├── train.txt │ │ │ └── valid.txt │ ├── imgs │ │ └── g2t.png │ ├── math_word_problem.ipynb │ └── utils.py ├── semantic_parsing.ipynb └── text_classification.ipynb ├── KDD2021_demo ├── kg_completion │ ├── .ipynb_checkpoints │ │ └── kgc-checkpoint.ipynb │ ├── __init__.py │ ├── data │ │ └── kinship │ │ │ ├── e1rel_to_e2_full.json │ │ │ ├── e1rel_to_e2_ranking_dev.json │ │ │ ├── e1rel_to_e2_ranking_test.json │ │ │ ├── e1rel_to_e2_train.json │ │ │ ├── test.txt │ │ │ ├── train.txt │ │ │ └── valid.txt │ ├── evaluation.py │ ├── kgc.ipynb │ ├── kinship │ │ ├── processed │ │ │ └── KG_graph.pt │ │ └── raw │ │ │ └── kinship.tar.gz │ ├── model.py │ ├── preprocess.sh │ ├── saved_models │ │ └── kinship_ggnn_distmult_0.2_0.25.model │ ├── spodernet │ │ ├── __init__.py │ │ ├── backends │ │ │ ├── __init__.py │ │ │ ├── tfbackend.py │ │ │ ├── tfmodels.py │ │ │ ├── torchbackend.py │ │ │ └── torchmodels.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ └── snli2spoder.py │ │ ├── frontend.py │ │ ├── hooks.py │ │ ├── interfaces.py │ │ ├── preprocessing │ │ │ ├── __init__.py │ │ │ ├── batching.py │ │ │ ├── pipeline.py │ │ │ ├── processors.py │ │ │ └── vocab.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── cuda_utils.py │ │ │ ├── global_config.py │ │ │ ├── logger.py │ │ │ ├── spacy_util.py │ │ │ └── util.py │ └── wrangle_KG.py ├── math_word_problem_solving │ ├── .ipynb_checkpoints │ │ └── math_word_problem-checkpoint.ipynb │ ├── config.yaml │ ├── data │ │ ├── processed │ │ │ └── DependencyGraph │ │ │ │ ├── data.pt │ │ │ │ └── vocab.pt │ │ └── raw │ │ │ ├── test.txt │ │ │ ├── train.txt │ │ │ └── valid.txt │ ├── imgs │ │ └── g2t.png │ ├── math_word_problem.ipynb │ └── utils.py ├── semantic_parsing.ipynb └── text_classification.ipynb ├── LICENSE ├── NAACL2021_demo ├── semantic_parsing.ipynb └── text_classification.ipynb ├── README.md ├── SIGIR2021_demo ├── semantic_parsing.ipynb └── text_classification.ipynb ├── 
TheWebConf2022_demo ├── Math-word-problem │ ├── config.yaml │ ├── data │ │ ├── processed │ │ │ └── NodeEmbGraph │ │ │ │ ├── data.pt │ │ │ │ └── vocab.pt │ │ └── raw │ │ │ ├── test.txt │ │ │ ├── train.txt │ │ │ └── valid.txt │ ├── imgs │ │ └── g2t.png │ ├── math_word_problem.ipynb │ └── utils.py └── text_classification.ipynb ├── config ├── jobs │ ├── gat_bi_sep_dynamic_node_emb.yaml │ └── gat_bi_sep_dynamic_node_emb_v2.yaml └── trec │ ├── graphsage_bi_fuse_static_dependency.yaml │ └── graphsage_bi_fuse_static_dependency_v2.yaml └── data ├── jobs ├── processed │ ├── NodeEmbGraph │ │ ├── data.pt │ │ └── vocab.pt │ └── node_emb_graph │ │ ├── data.pt │ │ └── vocab.pt └── raw │ ├── sequence.pt │ ├── sequence.txt │ ├── test.txt │ ├── train.txt │ ├── vocab.f.txt │ └── vocab.q.txt └── trec ├── processed └── dependency_graph │ ├── data.pt │ ├── label.pt │ └── vocab.pt └── raw ├── test.txt └── train.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | */__pycache__ 3 | */.ipynb_checkpoints 4 | */out 5 | */.vector_cache 6 | *.DS_Store 7 | */*/out 8 | */*/.vector_cache/ 9 | *.pt -------------------------------------------------------------------------------- /DLG4NLP@ICLR2022_demo/Math-word-problem/config.yaml: -------------------------------------------------------------------------------- 1 | graph_construction_name: "node_emb" 2 | graph_embedding_name: "graphsage" 3 | decoder_name: "stdtree" 4 | 5 | graph_construction_args: 6 | graph_construction_share: 7 | graph_name: 'node_emb' 8 | root_dir: "data" 9 | topology_subdir: 'NodeEmbGraph' 10 | thread_number: 4 11 | port: 9000 12 | timeout: 15000 13 | 14 | graph_construction_private: 15 | edge_strategy: 'homogeneous' 16 | merge_strategy: 'tailhead' 17 | sequential_link: true 18 | as_node: false 19 | sim_metric_type: 'weighted_cosine' 20 | num_heads: 1 21 | top_k_neigh: null 22 | epsilon_neigh: 0.5 23 | smoothness_ratio: 0.1 24 | connectivity_ratio: 0.05 25 | 
sparsity_ratio: 0.1 26 | 27 | graph_initialization_args: 28 | input_size: 300 29 | hidden_size: 300 30 | word_dropout: 0.1 31 | rnn_dropout: 0.1 32 | fix_bert_emb: false 33 | fix_word_emb: false 34 | embedding_style: 35 | single_token_item: true 36 | emb_strategy: "w2v_bilstm" 37 | num_rnn_layers: 1 38 | bert_model_name: null 39 | bert_lower_case: null 40 | 41 | graph_embedding_args: 42 | graph_embedding_share: 43 | num_layers: 1 44 | input_size: 300 45 | hidden_size: 300 46 | output_size: 300 47 | direction_option: "undirected" 48 | feat_drop: 0.0 49 | attn_drop: 0.0 50 | 51 | graph_embedding_private: 52 | aggregator_type: "lstm" 53 | bias: true 54 | norm: null 55 | activation: "relu" 56 | use_edge_weight: true 57 | 58 | decoder_args: 59 | rnn_decoder_share: 60 | rnn_type: "lstm" 61 | input_size: 300 62 | hidden_size: 300 63 | rnn_emb_input_size: 300 64 | use_copy: true 65 | graph_pooling_strategy: null 66 | attention_type: "uniform" 67 | fuse_strategy: "concatenate" 68 | dropout: 0.3 69 | teacher_forcing_rate: 1.0 70 | 71 | rnn_decoder_private: 72 | max_decoder_step: 35 73 | max_tree_depth: 8 74 | use_sibling: false 75 | -------------------------------------------------------------------------------- /DLG4NLP@ICLR2022_demo/Math-word-problem/imgs/g2t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/DLG4NLP@ICLR2022_demo/Math-word-problem/imgs/g2t.png -------------------------------------------------------------------------------- /DLG4NLP@ICLR2022_demo/Math-word-problem/utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import sympy 4 | from random import randint 5 | from sympy.parsing.sympy_parser import parse_expr 6 | 7 | def convert_to_string(idx_list, form_manager): 8 | w_list = [] 9 | for i in range(len(idx_list)): 10 | 
def is_all_same(c1, c2, form_manager):
    """Return True when two index sequences denote the same answer.

    The sequences match when they have equal length and no position
    differs. Otherwise they are still accepted when ``is_solution_same``
    reports them equivalent.

    Args:
        c1: candidate sequence of vocabulary indices.
        c2: reference sequence of vocabulary indices.
        form_manager: vocabulary object forwarded to ``is_solution_same``.

    Returns:
        bool: True if the sequences are identical or judged equivalent.
    """
    if len(c1) == len(c2) and not any(a != b for a, b in zip(c1, c2)):
        return True
    # Fall back to the equation-level comparison for non-identical sequences.
    return bool(is_solution_same(c1, c2, form_manager))


def is_solution_same(i1, i2, form_manager):
    """Check whether two index sequences encode equations with the same root.

    Both sequences are rendered to space-joined strings through
    ``form_manager.get_idx_symbol``. Each string is split on ``'='``, the
    first two parts are parsed with ``parse_expr`` and the resulting
    equation is solved for the symbol ``x``. Only the first solution of each
    equation is compared.

    Args:
        i1: first sequence of vocabulary indices.
        i2: second sequence of vocabulary indices.
        form_manager: object exposing ``get_idx_symbol`` and ``unk_token``.

    Returns:
        False when either string lacks ``'='``, contains the unknown token,
        cannot be parsed or solved, or has no solution; otherwise the result
        of comparing the first solutions.
    """
    c1 = " ".join([form_manager.get_idx_symbol(x) for x in i1])
    c2 = " ".join([form_manager.get_idx_symbol(x) for x in i2])
    if ('=' not in c1) or ('=' not in c2):
        return False
    if (form_manager.unk_token in c1) or (form_manager.unk_token in c2):
        return False

    try:
        x = sympy.Symbol('x')
        # Only the first two '='-separated parts are used, as before.
        lhs1, rhs1 = c1.split('=')[:2]
        lhs2, rhs2 = c2.split('=')[:2]
        res1 = sympy.solve(sympy.Eq(parse_expr(lhs1), parse_expr(rhs1)), x)
        res2 = sympy.solve(sympy.Eq(parse_expr(lhs2), parse_expr(rhs2)), x)
        if not res1 or not res2:
            return False
        return res1[0] == res2[0]
    except Exception:
        # Best-effort comparison: any parse/solve failure means "not equal".
        # Exception (not BaseException) so Ctrl-C and SystemExit still
        # propagate to the caller.
        return False


def compute_accuracy(candidate_list, reference_list, form_manager):
    """Fraction of candidate/reference pairs accepted by ``is_all_same``.

    When the two lists differ in length a notice is printed and only the
    first ``min(len(candidate_list), len(reference_list))`` pairs are scored.

    Args:
        candidate_list: predicted index sequences.
        reference_list: gold index sequences.
        form_manager: vocabulary object forwarded to ``is_all_same``.

    Returns:
        float: matches divided by the number of compared pairs; ``0.0`` when
        there is nothing to compare (previously a ZeroDivisionError).
    """
    if len(candidate_list) != len(reference_list):
        print("candidate list has length {}, reference list has length {}\n".format(
            len(candidate_list), len(reference_list)))
    len_min = min(len(candidate_list), len(reference_list))
    if len_min == 0:
        return 0.0
    # zip() stops at the shorter list, i.e. exactly len_min pairs.
    c = sum(
        1
        for candidate, reference in zip(candidate_list, reference_list)
        if is_all_same(candidate, reference, form_manager)
    )
    return c / float(len_min)
def prepare_ext_vocab(batch_graph, src_vocab, device):
    """Build a per-batch vocabulary copy extended with unseen node tokens.

    A deep copy of ``src_vocab`` is made so the original is left untouched.
    For every entry of ``batch_graph.node_attributes`` whose ``"type"`` is
    missing/None or ``0``, a token that currently resolves to the same index
    as ``unk_token`` is added to the copy. The index of every node token in
    the (possibly extended) copy is stored as a ``torch.long`` tensor under
    ``batch_graph.node_features["token_id_oov"]`` on ``device``.

    Args:
        batch_graph: object exposing ``node_attributes`` (iterable of dicts
            with a ``"token"`` key) and a writable ``node_features`` mapping.
        src_vocab: vocabulary exposing ``get_symbol_idx``, ``add_symbol``
            and ``unk_token``.
        device: target device passed to ``Tensor.to``.

    Returns:
        The extended copy of ``src_vocab``.
    """
    extended_vocab = copy.deepcopy(src_vocab)
    token_ids = []
    for attrs in batch_graph.node_attributes:
        token = attrs["token"]
        node_type = attrs.get("type")
        if node_type is None or node_type == 0:
            unk_idx = extended_vocab.get_symbol_idx(extended_vocab.unk_token)
            if extended_vocab.get_symbol_idx(token) == unk_idx:
                extended_vocab.add_symbol(token)
        token_ids.append(extended_vocab.get_symbol_idx(token))

    id_tensor = torch.tensor(token_ids, dtype=torch.long)
    batch_graph.node_features["token_id_oov"] = id_tensor.to(device)
    return extended_vocab
def convert_to_string(idx_list, form_manager):
    """Render a sequence of vocabulary indices as a space-separated string.

    Args:
        idx_list: iterable of values convertible with ``int()``.
        form_manager: object whose ``get_idx_symbol(int)`` returns the
            symbol string for an index.

    Returns:
        str: the symbols joined by single spaces ("" for an empty input).
    """
    return " ".join(form_manager.get_idx_symbol(int(idx)) for idx in idx_list)
def is_solution_same(i1, i2, form_manager):
    """Check whether two index sequences encode equations with the same root.

    Both sequences are rendered to space-joined strings through
    ``form_manager.get_idx_symbol``. Each string is split on ``'='``, the
    first two parts are parsed with ``parse_expr`` and the resulting
    equation is solved for the symbol ``x``. Only the first solution of each
    equation is compared.

    Args:
        i1: first sequence of vocabulary indices.
        i2: second sequence of vocabulary indices.
        form_manager: object exposing ``get_idx_symbol`` and ``unk_token``.

    Returns:
        False when either string lacks ``'='``, contains the unknown token,
        cannot be parsed or solved, or has no solution; otherwise the result
        of comparing the first solutions.
    """
    c1 = " ".join([form_manager.get_idx_symbol(x) for x in i1])
    c2 = " ".join([form_manager.get_idx_symbol(x) for x in i2])
    if ('=' not in c1) or ('=' not in c2):
        return False
    if (form_manager.unk_token in c1) or (form_manager.unk_token in c2):
        return False

    try:
        x = sympy.Symbol('x')
        # Only the first two '='-separated parts are used, as before.
        lhs1, rhs1 = c1.split('=')[:2]
        lhs2, rhs2 = c2.split('=')[:2]
        res1 = sympy.solve(sympy.Eq(parse_expr(lhs1), parse_expr(rhs1)), x)
        res2 = sympy.solve(sympy.Eq(parse_expr(lhs2), parse_expr(rhs2)), x)
        if not res1 or not res2:
            return False
        return res1[0] == res2[0]
    except Exception:
        # Best-effort comparison: any parse/solve failure means "not equal".
        # Exception (not BaseException) so Ctrl-C and SystemExit still
        # propagate to the caller.
        return False


def compute_accuracy(candidate_list, reference_list, form_manager):
    """Fraction of candidate/reference pairs accepted by ``is_all_same``.

    When the two lists differ in length a notice is printed and only the
    first ``min(len(candidate_list), len(reference_list))`` pairs are scored.

    Args:
        candidate_list: predicted index sequences.
        reference_list: gold index sequences.
        form_manager: vocabulary object forwarded to ``is_all_same``.

    Returns:
        float: matches divided by the number of compared pairs; ``0.0`` when
        there is nothing to compare (previously a ZeroDivisionError).
    """
    if len(candidate_list) != len(reference_list):
        print("candidate list has length {}, reference list has length {}\n".format(
            len(candidate_list), len(reference_list)))
    len_min = min(len(candidate_list), len(reference_list))
    if len_min == 0:
        return 0.0
    # zip() stops at the shorter list, i.e. exactly len_min pairs.
    c = sum(
        1
        for candidate, reference in zip(candidate_list, reference_list)
        if is_all_same(candidate, reference, form_manager)
    )
    return c / float(len_min)
def prepare_ext_vocab(batch_graph, src_vocab, device):
    """Build a per-batch vocabulary copy extended with unseen node tokens.

    A deep copy of ``src_vocab`` is made so the original is left untouched.
    For every entry of ``batch_graph.node_attributes`` whose ``"type"`` is
    missing/None or ``0``, a token that currently resolves to the same index
    as ``unk_token`` is added to the copy. The index of every node token in
    the (possibly extended) copy is stored as a ``torch.long`` tensor under
    ``batch_graph.node_features["token_id_oov"]`` on ``device``.

    Args:
        batch_graph: object exposing ``node_attributes`` (iterable of dicts
            with a ``"token"`` key) and a writable ``node_features`` mapping.
        src_vocab: vocabulary exposing ``get_symbol_idx``, ``add_symbol``
            and ``unk_token``.
        device: target device passed to ``Tensor.to``.

    Returns:
        The extended copy of ``src_vocab``.
    """
    extended_vocab = copy.deepcopy(src_vocab)
    token_ids = []
    for attrs in batch_graph.node_attributes:
        token = attrs["token"]
        node_type = attrs.get("type")
        if node_type is None or node_type == 0:
            unk_idx = extended_vocab.get_symbol_idx(extended_vocab.unk_token)
            if extended_vocab.get_symbol_idx(token) == unk_idx:
                extended_vocab.add_symbol(token)
        token_ids.append(extended_vocab.get_symbol_idx(token))

    id_tensor = torch.tensor(token_ids, dtype=torch.long)
    batch_graph.node_features["token_id_oov"] = id_tensor.to(device)
    return extended_vocab
str2var['rel_eval'] 33 | e2_multi1 = str2var['e2_multi1'].float() 34 | e2_multi2 = str2var['e2_multi2'].float() 35 | pred1 = model.forward(e1, rel, kg_graph) 36 | pred2 = model.forward(e2, rel_reverse, kg_graph) 37 | pred1, pred2 = pred1.data, pred2.data 38 | e1, e2 = e1.data, e2.data 39 | e2_multi1, e2_multi2 = e2_multi1.data, e2_multi2.data 40 | for i in range(e1.shape[0]): 41 | # these filters contain ALL labels 42 | filter1 = e2_multi1[i].long() 43 | filter2 = e2_multi2[i].long() 44 | 45 | num = e1[i, 0].item() 46 | # save the prediction that is relevant 47 | target_value1 = pred1[i,e2[i, 0].item()].item() 48 | target_value2 = pred2[i,e1[i, 0].item()].item() 49 | # zero all known cases (this are not interesting) 50 | # this corresponds to the filtered setting 51 | pred1[i][filter1] = 0.0 52 | pred2[i][filter2] = 0.0 53 | # write base the saved values 54 | pred1[i][e2[i]] = target_value1 55 | pred2[i][e1[i]] = target_value2 56 | 57 | 58 | # sort and rank 59 | max_values, argsort1 = torch.sort(pred1, 1, descending=True) 60 | max_values, argsort2 = torch.sort(pred2, 1, descending=True) 61 | 62 | argsort1 = argsort1.cpu().numpy() 63 | argsort2 = argsort2.cpu().numpy() 64 | for i in range(e1.shape[0]): 65 | # find the rank of the target entities 66 | rank1 = np.where(argsort1[i]==e2[i, 0].item())[0][0] 67 | rank2 = np.where(argsort2[i]==e1[i, 0].item())[0][0] 68 | # rank+1, since the lowest rank is rank 1 not rank 0 69 | ranks.append(rank1+1) 70 | ranks_left.append(rank1+1) 71 | ranks.append(rank2+1) 72 | ranks_right.append(rank2+1) 73 | 74 | # this could be done more elegantly, but here you go 75 | for hits_level in range(10): 76 | if rank1 <= hits_level: 77 | hits[hits_level].append(1.0) 78 | hits_left[hits_level].append(1.0) 79 | else: 80 | hits[hits_level].append(0.0) 81 | hits_left[hits_level].append(0.0) 82 | 83 | if rank2 <= hits_level: 84 | hits[hits_level].append(1.0) 85 | hits_right[hits_level].append(1.0) 86 | else: 87 | hits[hits_level].append(0.0) 88 | 
#!/bin/bash
# Prepare the kinship knowledge-graph data for the kg_completion demo:
# create the working directories, unpack the raw archive into
# kg_completion/data/kinship, then run the wrangling script on it.

# Stop at the first failing step so the wrangling script never runs on a
# missing or partially extracted dataset.
set -e

# -p creates kg_completion/data as a parent and makes re-runs harmless
# (plain mkdir errors out when the directory already exists).
mkdir -p kg_completion/data/kinship
mkdir -p kg_completion/saved_models

tar -xvf kg_completion/kinship/raw/kinship.tar.gz -C kg_completion/data/kinship
python kg_completion/wrangle_KG.py kinship
-------------------------------------------------------------------------------- /IJCAI2021_demo/kg_completion/spodernet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/spodernet/__init__.py -------------------------------------------------------------------------------- /IJCAI2021_demo/kg_completion/spodernet/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/spodernet/backends/__init__.py -------------------------------------------------------------------------------- /IJCAI2021_demo/kg_completion/spodernet/backends/tfbackend.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from spodernet.interfaces import IAtBatchPreparedObservable 4 | from spodernet.utils.util import Timer 5 | from spodernet.utils.global_config import Config 6 | 7 | class TensorFlowConfig: 8 | inp = None 9 | support = None 10 | input_length = None 11 | support_length = None 12 | target = None 13 | index = None 14 | sess = None 15 | 16 | @staticmethod 17 | def init_batch_size(batch_size): 18 | TensorFlowConfig.inp = tf.placeholder(tf.int64, [batch_size, None]) 19 | TensorFlowConfig.support = tf.placeholder(tf.int64, [batch_size, None]) 20 | TensorFlowConfig.input_length = tf.placeholder(tf.int64, [batch_size,]) 21 | TensorFlowConfig.support_length = tf.placeholder(tf.int64, [batch_size,]) 22 | TensorFlowConfig.target = tf.placeholder(tf.int64, [batch_size]) 23 | TensorFlowConfig.index = tf.placeholder(tf.int64, [batch_size]) 24 | 25 | @staticmethod 26 | def get_session(): 27 | if TensorFlowConfig.sess is None: 28 | TensorFlowConfig.sess = tf.Session() 29 | return 
class TensorFlowConverter(IAtBatchPreparedObservable):
    """Batch hook that pairs prepared batch arrays with the shared placeholders."""

    def at_batch_prepared(self, batch_parts):
        """Build the ``str2var`` and ``feed_dict`` mappings for one batch.

        Args:
            batch_parts: six-element sequence unpacked as
                ``(inp, inp_len, sup, sup_len, t, idx)``.

        Returns:
            tuple: ``(str2var, feed_dict)``. ``str2var`` maps the names
            ``input``, ``input_length``, ``support``, ``support_length``,
            ``target`` and ``index`` to the ``TensorFlowConfig``
            placeholders; ``feed_dict`` maps those placeholders to the
            corresponding batch values.

        Raises:
            RuntimeError: if ``TensorFlowConfig.inp`` is still ``None``,
                i.e. ``TensorFlowConfig.init_batch_size`` was never called.
                (The previous code called an undefined ``log`` here and
                would otherwise have gone on to use ``None`` as feed keys.)
        """
        inp, inp_len, sup, sup_len, t, idx = batch_parts
        # Identity check: `== None` on a tensor-like object may be overloaded.
        if TensorFlowConfig.inp is None:
            raise RuntimeError(
                'You need to initialize the batch size via '
                'TensorFlowConfig.init_batch_size(batchsize)!')

        feed_dict = {
            TensorFlowConfig.inp: inp,
            TensorFlowConfig.support: sup,
            TensorFlowConfig.input_length: inp_len,
            TensorFlowConfig.support_length: sup_len,
            TensorFlowConfig.target: t,
            TensorFlowConfig.index: idx,
        }

        str2var = {
            'input': TensorFlowConfig.inp,
            'input_length': TensorFlowConfig.input_length,
            'support': TensorFlowConfig.support,
            'support_length': TensorFlowConfig.support_length,
            'target': TensorFlowConfig.target,
            'index': TensorFlowConfig.index,
        }

        return str2var, feed_dict


def build_str2var_dict():
    """Collect the ``TensorFlowConfig`` placeholders that are already set.

    Returns:
        dict: name -> placeholder for every ``TensorFlowConfig`` attribute
        among ``inp``, ``support``, ``target``, ``input_length``,
        ``support_length`` and ``index`` that is not ``None``.
    """
    candidates = (
        ('input', TensorFlowConfig.inp),
        ('support', TensorFlowConfig.support),
        ('target', TensorFlowConfig.target),
        ('input_length', TensorFlowConfig.input_length),
        ('support_length', TensorFlowConfig.support_length),
        ('index', TensorFlowConfig.index),
    )
    str2var = {}
    for name, placeholder in candidates:
        if placeholder is not None:
            str2var[name] = placeholder
    return str2var
0.0: 81 | self.loss += tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()]) * Config.L2 82 | 83 | self.min_op = optimizer.minimize(self.loss) 84 | 85 | tf.global_variables_initializer().run(session=self.sess) 86 | 87 | def train_model(self, batcher, epochs=1, iterations=None): 88 | for epoch in range(epochs): 89 | for i, (str2var, feed_dict) in enumerate(batcher): 90 | _, argmax_values = self.sess.run([self.min_op, self.argmax], feed_dict=feed_dict) 91 | 92 | batcher.state.argmax = argmax_values 93 | batcher.state.targets = feed_dict[TensorFlowConfig.target] 94 | 95 | if iterations > 0: 96 | if i == iterations: break 97 | 98 | def eval_model(self, batcher, iterations=None): 99 | for i, (str2var, feed_dict) in enumerate(batcher): 100 | argmax_values = self.sess.run([self.argmax], feed_dict=feed_dict)[0] 101 | 102 | batcher.state.argmax = argmax_values 103 | batcher.state.targets = feed_dict[TensorFlowConfig.target] 104 | 105 | if iterations > 0: 106 | if i == iterations: break 107 | 108 | -------------------------------------------------------------------------------- /IJCAI2021_demo/kg_completion/spodernet/backends/tfmodels.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import placeholder 3 | from spodernet.backends.tfbackend import TensorFlowConfig 4 | from spodernet.utils.global_config import Config 5 | from spodernet.frontend import AbstractModel 6 | import numpy as np 7 | 8 | def reader(inputs, lengths, output_size, contexts=(None, None), scope=None): 9 | with tf.variable_scope(scope or "reader") as varscope: 10 | 11 | cell = tf.contrib.rnn.LSTMCell(output_size, state_is_tuple=True,initializer=tf.contrib.layers.xavier_initializer()) 12 | 13 | cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=1.0-Config.dropout) 14 | 15 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 16 | cell, 17 | cell, 18 | inputs, 19 | sequence_length=lengths, 20 | 
def predictor(inputs, targets, target_size):
    """Linear classification head with loss and arg-max prediction.

    Args:
        inputs: tensor fed to a fully connected layer with no activation.
        targets: labels passed to
            ``sparse_softmax_cross_entropy_with_logits``.
        target_size: number of output units of the fully connected layer.

    Returns:
        list: ``[logits, loss, predict]`` where ``loss`` is the mean
        cross-entropy (op name ``predictor_loss``) and ``predict`` is the
        arg-max over axis 1 of the softmax (op name ``prediction``).
    """
    init = tf.contrib.layers.xavier_initializer(uniform=True)  # uniform=False for truncated normal
    logits = tf.contrib.layers.fully_connected(
        inputs, target_size, weights_initializer=init, activation_fn=None)

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets),
        name='predictor_loss')
    # tf.arg_max is a deprecated alias; tf.argmax is the supported spelling.
    predict = tf.argmax(tf.nn.softmax(logits), 1, name='prediction')
    return [logits, loss, predict]


class TFEmbedding(AbstractModel):
    """Embeds the shared input and support placeholders with one matrix."""

    def __init__(self, embedding_size, num_embeddings, scope=None):
        """Store the embedding matrix dimensions.

        Args:
            embedding_size: width of each embedding vector.
            num_embeddings: number of rows in the embedding matrix.
            scope: stored on the instance; not read by ``forward``.
        """
        super(TFEmbedding, self).__init__()

        self.embedding_size = embedding_size
        self.scope = scope
        self.num_embeddings = num_embeddings

    def forward(self, str2var, *args):
        """Look up embeddings for ``TensorFlowConfig.inp`` and ``.support``.

        ``str2var`` is only handed to ``expected_str2var_keys``; the lookups
        read the placeholders from ``TensorFlowConfig`` directly.

        Returns:
            tuple: ``(seqQ, seqS)``, the embedded input and support
            sequences.
        """
        # NOTE(review): these three calls presumably declare/validate the
        # layer contract in AbstractModel -- confirm in spodernet.frontend.
        self.expected_str2var_keys(str2var, ['input', 'support'])
        self.expected_args('None', 'None')
        self.generated_outputs('input idx, support idx', 'both sequences have shape = [batch, timesteps, embedding dim]')

        # Normal initializer with mean 0 and stddev 1/sqrt(embedding_size).
        embeddings = tf.get_variable("embeddings", [self.num_embeddings, self.embedding_size],
                                     initializer=tf.random_normal_initializer(0., 1./np.sqrt(self.embedding_size)),
                                     trainable=True, dtype="float32")

        with tf.variable_scope("embedders") as varscope:
            seqQ = tf.nn.embedding_lookup(embeddings, TensorFlowConfig.inp)
            varscope.reuse_variables()
            seqS = tf.nn.embedding_lookup(embeddings, TensorFlowConfig.support)

        return seqQ, seqS
class TFSoftmaxCrossEntropy(AbstractModel):
    """Final layer that applies ``predictor`` to the previous layer's output."""

    def __init__(self, num_labels):
        """Remember how many labels the classification head must produce."""
        super(TFSoftmaxCrossEntropy, self).__init__()
        self.num_labels = num_labels

    def forward(self, str2var, *args):
        """Run the classification head on ``args[0]``.

        The labels are read from ``TensorFlowConfig.target``; ``str2var`` is
        only handed to ``expected_str2var_keys``.

        Returns:
            list: ``[logits, loss, argmax]`` as produced by ``predictor``.
        """
        self.expected_str2var_keys(str2var, ['target'])
        self.expected_args('some inputs', 'dimension: [batch, any]')
        self.generated_outputs('logits, loss, argmax', 'dimensions: logits = [batch, labels], loss = 1x1, argmax = [batch, 1]')

        features = args[0]
        scores, cost, best = predictor(features, TensorFlowConfig.target, self.num_labels)
        return [scores, cost, best]
class TorchConverter(IAtBatchPreparedObservable):
    """Batch hook that wraps the numpy entries of a batch in torch Variables."""

    def __init__(self, is_volatile):
        self.is_volatile = is_volatile

    def at_batch_prepared(self, str2var):
        """Convert every entry whose key does not contain 'length', in place.

        int32 arrays are widened to int64 before conversion. The same dict is
        returned.
        """
        for name in str2var.keys():
            if 'length' in name:
                # Length entries are deliberately left unconverted.
                continue
            if str2var[name].dtype == np.int32:
                str2var[name] = np.int64(str2var[name])
            as_tensor = torch.from_numpy(str2var[name])
            str2var[name] = Variable(as_tensor, volatile=self.is_volatile)
        return str2var


class TorchCUDAConverter(IAtBatchPreparedObservable):
    """Batch hook that moves the converted entries of a batch to a CUDA device."""

    def __init__(self, device_id):
        self.device_id = device_id

    def at_batch_prepared(self, str2var):
        """Call ``.cuda(device_id, True)`` on every non-'length' entry, in place."""
        for name in str2var.keys():
            if 'length' not in name:
                # The second positional argument is passed as True, presumably
                # the asynchronous / non-blocking copy flag -- TODO confirm
                # against the torch version in use.
                str2var[name] = str2var[name].cuda(self.device_id, True)
        return str2var
def get_list_of_torch_modules(model):
    """Flatten ``model.modules`` by one level.

    Entries that themselves expose a ``modules`` attribute contribute the
    items of that attribute; every other entry is taken as-is.
    """
    modules = []
    for module in model.modules:
        if hasattr(module, 'modules'):
            modules.extend(module.modules)
        else:
            modules.append(module)
    return modules


def _reached_iteration_limit(i, iterations):
    """True when a positive iteration limit is set and batch index ``i`` hit it."""
    # ``iterations`` defaults to None; comparing None with 0 raises TypeError
    # on Python 3, so the None case has to be excluded explicitly.
    return iterations is not None and iterations > 0 and i == iterations


def train_model(model, batcher, epochs=1, iterations=None):
    """Train ``model`` with Adam (lr=0.001) over the batches of ``batcher``.

    Args:
        model: object with a ``modules`` list and a ``forward(str2var)`` method
            returning ``(logits, loss, argmax)``.
        batcher: iterable of ``str2var`` dicts; its ``state`` attribute
            receives ``argmax`` and ``targets`` after every batch.
        epochs: number of passes over ``batcher``.
        iterations: optional positive limit. The loop stops after the batch
            whose index equals ``iterations`` has been processed. ``None`` or
            a non-positive value means no limit.
    """
    modules = get_list_of_torch_modules(model)
    generators = []
    for module in modules:
        if Config.cuda:
            module.cuda()
        generators.append(module.parameters())

    parameters = chain.from_iterable(generators)
    optimizer = torch.optim.Adam(parameters, lr=0.001)
    for module in modules:
        module.train()

    for epoch in range(epochs):
        for i, str2var in enumerate(batcher):
            optimizer.zero_grad()
            logits, loss, argmax = model.forward(str2var)
            loss.backward()
            optimizer.step()
            batcher.state.argmax = argmax
            batcher.state.targets = str2var['target']

            if _reached_iteration_limit(i, iterations):
                break


def eval_model(model, batcher, iterations=None):
    """Run ``model`` in eval mode over ``batcher`` without updating weights.

    ``batcher.state.argmax`` / ``batcher.state.targets`` are updated after
    every batch. ``iterations`` has the same meaning as in ``train_model``.
    """
    modules = get_list_of_torch_modules(model)
    for module in modules:
        module.eval()

    for i, str2var in enumerate(batcher):
        logits, loss, argmax = model.forward(str2var)
        batcher.state.argmax = argmax
        batcher.state.targets = str2var['target']

        if _reached_iteration_limit(i, iterations):
            break
class TorchEmbedding(torch.nn.Module, AbstractModel):
    """Embedding lookup shared by the 'input' and 'support' index sequences."""

    def __init__(self, embedding_size, num_embeddings):
        super(TorchEmbedding, self).__init__()

        # Index 0 is the padding index.
        self.emb = torch.nn.Embedding(num_embeddings,
                                      embedding_size, padding_idx=0)

    def forward(self, str2var, *args):
        """Embed ``str2var['input']`` and/or ``str2var['support']``.

        Returns a list holding the embedded input first (when present),
        followed by the embedded support (when present).
        """
        self.expected_str2var_keys_oneof(str2var, ['input', 'support'])
        self.expected_args('None', 'None')
        self.generated_outputs('input idx, support idx', 'both sequences have shape = [batch, timesteps, embedding dim]')

        embedded_results = []
        if 'input' in str2var:
            embedded_results.append(self.emb(str2var['input']))

        if 'support' in str2var:
            embedded_results.append(self.emb(str2var['support']))

        return embedded_results


class TorchBiDirectionalLSTM(torch.nn.Module, AbstractModel):
    """Single (by default bidirectional) batch-first LSTM over one sequence."""

    def __init__(self, input_size, hidden_size,
                 dropout=0.0, layers=1,
                 bidirectional=True, to_cuda=False, conditional_encoding=True):
        super(TorchBiDirectionalLSTM, self).__init__()

        use_bias = True
        num_directions = (1 if not bidirectional else 2)

        # Positional arguments: num_layers, bias, batch_first=True, dropout,
        # bidirectional.
        # NOTE(review): the dropout rate is hard-coded to 0.2 and the
        # ``dropout`` argument is never read; left unchanged so existing
        # callers keep their behaviour -- confirm which one is intended.
        self.lstm = LSTM(input_size, hidden_size, layers,
                         use_bias, True, 0.2, bidirectional)

        # Initial hidden / cell state buffers; they are zeroed on every
        # forward call before being fed to the LSTM.
        self.h0 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
        self.c0 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))

        if Config.cuda:
            self.h0 = self.h0.cuda()
            self.c0 = self.c0.cuda()

    def forward(self, str2var, *args):
        """Run the LSTM over the first positional argument.

        Returns ``[out, hid]`` exactly as returned by the underlying LSTM.
        """
        self.expected_str2var_keys(str2var, [])
        self.expected_args('embedded seq', 'size [batch, time steps, embedding dim]')
        self.generated_outputs('LSTM output seq', 'size [batch, time steps, 2x hidden dim]')
        # ``args`` is the *args tuple; the LSTM needs the sequence tensor
        # itself, not the tuple wrapping it.
        seq = args[0]
        self.h0.data.zero_()
        self.c0.data.zero_()
        out, hid = self.lstm(seq, (self.h0, self.c0))
        return [out, hid]
class TorchVariableLengthOutputSelection(torch.nn.Module, AbstractModel):
    """Selects, per example, the LSTM output at position ``length - 1``
    for two sequences and concatenates the two selections."""

    def __init__(self):
        super(TorchVariableLengthOutputSelection, self).__init__()
        # Never assigned anywhere else in this class, so both stay None.
        self.b1 = None
        self.b2 = None

    def forward(self, str2var, *args):
        """Return ``[out]`` where ``out`` is the concatenation (dim 1) of the
        selected outputs of both sequences.

        Reads ``str2var['input_length']`` and ``str2var['support_length']``;
        ``args`` must contain exactly two LSTM output tensors.
        """
        self.expected_str2var_keys(str2var, ['input_length', 'support_length'])
        self.expected_args('LSTM output sequence input , LSTM output sequence support', 'dimension of both: [batch, time steps, 2x LSTM hidden size]')
        self.generated_outputs('stacked bidirectional outputs of last timestep', 'dim is [batch_size, 4x hidden size]')

        output_lstm1, output_lstm2 = args

        l1, l2 = str2var['input_length'], str2var['support_length']
        # self.b1 is always None (see __init__), so fresh byte masks shaped
        # like the LSTM outputs are allocated on every call.
        if self.b1 == None:
            b1 = torch.ByteTensor(output_lstm1.size())
            b2 = torch.ByteTensor(output_lstm2.size())
            if Config.cuda:
                b1 = b1.cuda()
                b2 = b2.cuda()

        # Mark, for example i, the whole feature vector at index length-1.
        b1.fill_(0)
        for i, num in enumerate(l1.data):
            b1[i,num-1,:] = 1
        # Masked selection flattens the result; reshape to one row per example.
        # Relies on the batch holding exactly Config.batch_size examples.
        out1 = output_lstm1[b1].view(Config.batch_size, -1)

        b2.fill_(0)
        for i, num in enumerate(l2.data):
            b2[i,num-1,:] = 1
        out2 = output_lstm2[b2].view(Config.batch_size, -1)

        out = torch.cat([out1,out2], 1)
        return [out]
class Model(object):
    """Sequential container: each module's outputs feed the next module."""

    def __init__(self, input_module=None):
        self.modules = []
        self.input_module = input_module
        self.module = self

    def add(self, module):
        """Append ``module`` to the end of the pipeline."""
        self.modules.append(module)

    def forward(self, str2var, *inputs):
        """Run all modules in insertion order.

        Every module is called as ``module.forward(str2var, *outputs)`` where
        ``outputs`` is what the previous module returned (initially
        ``inputs``). With no modules the ``inputs`` tuple is returned as-is.
        """
        # ``inputs`` is a *args tuple and can never be None, so the former
        # ``inputs == None`` special case was unreachable and has been removed.
        outputs = inputs
        for module in self.modules:
            outputs = module.forward(str2var, *outputs)
        return outputs


class Trainer(object):
    """Dispatches training / evaluation to the backend selected in ``Config``."""

    def __init__(self, model):
        self.model = model

        self.trainer_backend = None
        # Both stay None when Config.backend is neither TENSORFLOW nor TORCH;
        # train()/evaluate() would then fail when trying to call None.
        self.train_func = None
        self.eval_func = None
        if Config.backend == Backends.TENSORFLOW:
            from spodernet.backends.tfbackend import TFTrainer
            self.trainer_backend = TFTrainer(model)
            # The TF trainer already holds the model, so the leading model
            # argument passed by train()/evaluate() is ignored.
            self.train_func = lambda _, batch, epochs, iterations: self.trainer_backend.train_model(batch, epochs, iterations)
            self.eval_func = lambda _, batch, iterations: self.trainer_backend.eval_model(batch, iterations)
        elif Config.backend == Backends.TORCH:
            from spodernet.backends.torchbackend import train_model, eval_model
            self.train_func = train_model
            self.eval_func = eval_model

    def train(self, batcher, epochs=1, iterations=None):
        """Train the wrapped model on ``batcher`` using the backend function."""
        self.train_func(self.model, batcher, epochs, iterations)

    def evaluate(self, batcher, iterations=None):
        """Evaluate the wrapped model on ``batcher`` using the backend function."""
        self.eval_func(self.model, batcher, iterations)
class Embedding(object):
    """Backend-agnostic embedding layer.

    Builds the TensorFlow or Torch embedding module depending on
    ``Config.backend`` and forwards calls to it.
    """

    def __init__(self, embedding_size, num_embeddings, scope=None):
        self.embedding_size = embedding_size
        self.scope = scope
        self.num_embeddings = num_embeddings
        self.module = None

        selected = Config.backend
        if selected == Backends.TORCH:
            from spodernet.backends.torchmodels import TorchEmbedding
            self.module = TorchEmbedding(embedding_size, num_embeddings)
            # NOTE(review): ``modules`` is assumed to be set only for the torch
            # backend, mirroring PairedBiDirectionalLSTM -- the flattened
            # source does not show the indentation; confirm.
            self.modules = [self.module]
        elif selected == Backends.TENSORFLOW:
            from spodernet.backends.tfmodels import TFEmbedding
            self.module = TFEmbedding(embedding_size, num_embeddings, scope)

    def forward(self, str2var, *args):
        """Delegate to the backend module's ``forward``."""
        backend_module = self.module
        return backend_module.forward(str2var, *args)
class SoftmaxCrossEntropy(object):
    """Backend-agnostic softmax / cross-entropy output layer.

    ``input_size`` is only handed to the Torch implementation; the TensorFlow
    implementation is built from ``num_labels`` alone.
    """

    def __init__(self, input_size, num_labels):
        super(SoftmaxCrossEntropy, self).__init__()
        self.num_labels = num_labels
        self.module = None

        selected = Config.backend
        if selected == Backends.TORCH:
            from spodernet.backends.torchmodels import TorchSoftmaxCrossEntropy
            self.module = TorchSoftmaxCrossEntropy(input_size, num_labels)
            # NOTE(review): ``modules`` is assumed to be set only for the torch
            # backend, mirroring PairedBiDirectionalLSTM -- the flattened
            # source does not show the indentation; confirm.
            self.modules = [self.module]
        elif selected == Backends.TENSORFLOW:
            from spodernet.backends.tfmodels import TFSoftmaxCrossEntropy
            self.module = TFSoftmaxCrossEntropy(num_labels)

    def forward(self, str2var, *args):
        """Delegate to the backend module's ``forward``."""
        backend_module = self.module
        return backend_module.forward(str2var, *args)
class AbstractHook(IAtIterEndObservable, IAtEpochEndObservable):
    """Base class for metric hooks.

    Subclasses implement ``calculate_metric``; this class keeps a running
    mean / sum of squared deviations of the metric and logs it together with
    a confidence interval every ``print_every_x_batches`` iterations and at
    the end of every epoch.
    """

    def __init__(self, name, metric_name, print_every_x_batches):
        self.epoch_errors = []      # one get_confidence_intervals() result per epoch
        self.current_scores = []    # raw metric values of the current epoch
        self.name = name
        self.iter_count = 0
        self.print_every = print_every_x_batches
        self.metric_name = metric_name
        self.epoch = 1

        # Running statistics, updated with the online algorithm described at
        # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
        self.n = 0
        self.epoch_n = 0
        self.mean = 0
        self.M2 = 0
        self.load_backend_specific_functions()

    def load_backend_specific_functions(self):
        """Install ``self.convert_state`` for the configured backend.

        For the torch backend the converter replaces Variable-valued state
        fields by their ``.data``; for every other backend it is the identity.
        """
        if Config.backend == Backends.TORCH:
            from torch.autograd import Variable
            def convert_state(state):
                # Mutates ``state`` in place and returns the same object.
                if isinstance(state.targets, Variable):
                    state.targets = state.targets.data
                if isinstance(state.argmax, Variable):
                    state.argmax = state.argmax.data
                if isinstance(state.pred, Variable):
                    state.pred = state.pred.data
                if isinstance(state.loss, Variable):
                    state.loss = state.loss.data
                if isinstance(state.multi_labels, Variable):
                    state.multi_labels = state.multi_labels.data

                return state

            self.convert_state = convert_state
        else:
            self.convert_state = lambda x: x

    def calculate_metric(self, state):
        """Compute the metric for one iteration; must be overridden."""
        raise NotImplementedError('Classes that inherit from abstract hook need to implement the calcualte metric method.')

    def at_end_of_iter_event(self, state):
        """Fold this iteration's metric into the running statistics.

        Returns ``(lower, upper, mean, n)`` from ``print_statistic`` on
        iterations where the statistics are printed (and then reset),
        otherwise ``(0, 0, mean, n)``.
        """
        state = self.convert_state(state)
        metric = self.calculate_metric(state)
        #print(metric)

        # Online mean / M2 update; the order of these four statements matters
        # because delta2 must use the already-updated mean.
        self.n += 1
        delta = metric - self.mean
        self.mean += delta/self.n
        delta2 = metric - self.mean
        self.M2 += delta*delta2

        self.current_scores.append(metric)
        self.iter_count += 1
        if self.iter_count % self.print_every == 0:
            lower, upper, m, n = self.print_statistic()
            # Start a fresh window after printing.
            self.n = 0
            self.mean = 0
            self.M2 = 0
            return lower, upper, m, n
        return 0, 0, self.mean, self.n

    def at_end_of_epoch_event(self, state):
        """Log the statistics of the current window, record them in
        ``epoch_errors`` and reset all per-epoch counters.

        Returns ``(0, 0, 0, 0)`` when no metric was accumulated since the
        last reset.
        """
        if self.n == 0: return 0, 0, 0, 0
        self.epoch_errors.append(self.get_confidence_intervals())
        lower, upper, m, n = self.print_statistic(True)
        del self.current_scores[:]
        self.n = 0
        self.mean = 0
        self.M2 = 0
        self.epoch += 1
        self.iter_count = 0
        return lower, upper, m, n

    def get_confidence_intervals(self, percentile=0.99, limit=1000):
        """Return ``[n, lower, mean, upper]`` for the current window.

        The half-width is ``z * sqrt((M2 / n) / n)`` with
        ``z = norm.ppf(percentile)``. ``limit`` is not used. Divides by
        ``self.n``, so it must only be called when at least one metric has
        been accumulated.
        """
        # NOTE(review): norm.ppf(0.99) is the one-sided 99% quantile, while the
        # log message calls the result a "99% CI" -- confirm the intent.
        z = scipy.stats.norm.ppf(percentile)
        var = self.M2/ (self.n)
        SE = np.sqrt(var/self.n)
        lower = self.mean-(z*SE)
        upper = self.mean+(z*SE)
        return [self.n, lower, self.mean, upper]

    def print_statistic(self, at_epoch_end=False):
        """Log mean and confidence interval; returns ``(lower, upper, mean, n)``.

        With ``at_epoch_end`` the message is framed by an epoch banner.
        """
        n, lower, m, upper = self.get_confidence_intervals()
        str_message = '{3} {4}: {2:.5}\t99% CI: ({0:.5}, {1:.5}), n={5}'.format(lower, upper, m, self.name, self.metric_name, self.n)
        if at_epoch_end: log.info('\n')
        if at_epoch_end: log.info('#'*40)
        if at_epoch_end: log.info(' '*10 + 'COMPLETED EPOCH: {0}'.format(self.epoch) + ' '*30)
        log.info(str_message)
        if at_epoch_end: log.info('#'*40)
        if at_epoch_end: log.info('\n')
        return lower, upper, m, n
class TopKRankingLoss(AbstractHook):
    """Hits@k hook: fraction of examples whose target is among the k
    highest-scoring predictions (torch backend only)."""

    def __init__(self, k, filtered=False, name='', print_every_x_batches=1000):
        # Metric name is 'Hits@k loss', prefixed with 'Filtered ' when filtered.
        super(TopKRankingLoss, self).__init__(name, '{1}Hits@{0} loss'.format(k, ('' if not filtered else 'Filtered ')), print_every_x_batches)
        self.func = None
        self.argsort = None
        self.sum_func = None
        self.k = k
        self.filtered = filtered
        if Config.backend == Backends.TORCH:
            import torch
            self.argsort = lambda x, k: torch.topk(x, k)
            self.sum_func = lambda x: torch.sum(x)


    def calculate_metric(self, state):
        """Return (number of targets found in the top k) / (number of rows).

        Raises ``Exception`` for any backend other than torch.
        """
        if Config.backend == Backends.TORCH:
            if self.filtered:
                import torch
                # Order matters: remember the target scores, push every entry
                # flagged in multi_labels to a very low score, then restore
                # the target scores so only the *other* labels are filtered.
                # NOTE: this modifies state.pred in place, so later consumers
                # of the same state see the filtered scores.
                saved = torch.index_select(state.pred,1,state.targets)
                state.pred[state.multi_labels.byte()] = -100000.0
                state.pred.index_copy_(1, state.targets, saved)

            max_values, argmax = self.argsort(state.pred, self.k)
            in_topk = 0
            for i in range(self.k):
                in_topk += self.sum_func(argmax[:,i] == state.targets)
            n = state.pred.size()[0]
            # NOTE(review): in_topk is whatever torch.sum returns; unlike
            # AccuracyHook no .item() is applied before dividing -- confirm
            # the downstream statistics code copes with that type.
            return in_topk/np.float32(n)
        else:
            raise Exception('Backend has unsupported value {0}'.format(Config.backend))
class IntersectionHook(AbstractHook):
    """Hook reporting how many predicted values also occur in the targets,
    row by row, relative to the total number of target elements."""

    def __init__(self, name='', print_every_x_batches=1000):
        super(IntersectionHook, self).__init__(name, 'Intersection', print_every_x_batches)

    def calculate_metric(self, state):
        """Return ``sum_rows |unique(pred_row) & unique(target_row)| / targets.size``."""
        state = self.convert_state(state)
        preds = state.pred
        targets = state.targets
        if Config.cuda:
            preds = preds.cpu()
            targets = targets.cpu()

        preds = preds.numpy()
        targets = targets.numpy()
        n = targets.size
        k = 0
        # Walk over the rows actually present instead of Config.batch_size:
        # a batch shorter than the configured size (e.g. the last one) would
        # otherwise raise an IndexError.
        for pred_row, target_row in zip(preds, targets):
            k += np.intersect1d(pred_row, target_row).size

        return k/float(n)
self.get_time_string(upper) 239 | log.info('{3} {4}: {2}\t99% CI: ({0}, {1}), n={5}'.format(lower, upper, m, self.name, self.metric_name, n)) 240 | return lower, upper, m, n 241 | 242 | def at_start_of_epoch_event(self, batcher_state): 243 | self.t.tick('ETA') 244 | t = self.t.tick('Epoch') 245 | 246 | def at_end_of_epoch_event(self, state): 247 | self.t.tock('ETA') 248 | epoch_time = self.t.tock('Epoch') 249 | self.epoch_errors.append([epoch_time]) 250 | log.info('Total epoch time: {0}'.format(self.get_time_string(epoch_time))) 251 | del self.current_scores[:] 252 | self.n = 0 253 | self.mean = 0 254 | self.M2 = 0 255 | self.skipped_first = False 256 | self.epoch += 1 257 | return epoch_time 258 | -------------------------------------------------------------------------------- /IJCAI2021_demo/kg_completion/spodernet/interfaces.py: -------------------------------------------------------------------------------- 1 | #These are completly useless, but they signify intent which is important. 
2 | 3 | class IAtIterEndObservable(object): 4 | def at_end_of_iter_event(self, batcher_state): 5 | raise NotImplementedError('Subclasses of IAtIterEndObservable need to override the end_of_iter_event method') 6 | 7 | class IAtEpochStartObservable(object): 8 | def at_start_of_epoch_event(self, batcher_state): 9 | raise NotImplementedError('Subclasses of IAtEpochStartObservable need to override the at_start_of_epoch method') 10 | 11 | class IAtEpochEndObservable(object): 12 | def at_end_of_epoch_event(self, batcher_state): 13 | raise NotImplementedError('Subclasses of IAtEpochEndObservable need to override the end_of_iter_epoch method') 14 | 15 | class IAtBatchPreparedObservable(object): 16 | def at_batch_prepared(self, batch_parts): 17 | raise NotImplementedError('Subclasses of IAtBatchPreparedObservable need to override the at_batch_prepared method') 18 | -------------------------------------------------------------------------------- /IJCAI2021_demo/kg_completion/spodernet/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/spodernet/preprocessing/__init__.py -------------------------------------------------------------------------------- /IJCAI2021_demo/kg_completion/spodernet/preprocessing/vocab.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import numpy as np 4 | import os 5 | import time 6 | import datetime 7 | import pickle 8 | import urllib 9 | # import bashmagic 10 | import time 11 | import json 12 | 13 | from spodernet.utils.util import get_data_path, save_data, xavier_uniform_weight 14 | from os.path import join 15 | 16 | from spodernet.utils.util import Logger 17 | log = Logger('vocab.py.txt') 18 | 19 | '''This models the vocabulary and token embeddings''' 20 | 21 | class Vocab(object): 22 | 
'''Class that manages work/char embeddings''' 23 | 24 | def __init__(self, path, vocab = Counter(), labels = {}): 25 | '''Constructor. 26 | Args: 27 | vocab: Counter object with vocabulary. 28 | ''' 29 | self.index = None 30 | token2idx = {} 31 | idx2token = {} 32 | self.label2idx = {} 33 | self.idx2label = {} 34 | self.glove_cache = {} 35 | for i, item in enumerate(vocab.items()): 36 | token2idx[item[0]] = i+1 37 | idx2token[i+1] = item[0] 38 | 39 | for idx in labels: 40 | self.label2idx[labels[idx]] = idx 41 | self.idx2label[idx] = labels[idx] 42 | 43 | # out of vocabulary token 44 | token2idx['OOV'] = int(0) 45 | idx2token[int(0)] = 'OOV' 46 | # empty = 0 47 | token2idx[''] = int(1) 48 | idx2token[int(1)] = '' 49 | 50 | self.token2idx = token2idx 51 | self.idx2token = idx2token 52 | self.path = path 53 | if len(idx2token.keys()) > 0: 54 | self.next_idx = int(np.max(list(idx2token.keys())) + 1) 55 | else: 56 | self.next_idx = int(2) 57 | 58 | if len(self.idx2label.keys()) > 0: 59 | self.next_label_2dx = int(int(np.max(self.idx2label.keys())) + 1) 60 | else: 61 | self.next_label_idx = int(0) 62 | 63 | @property 64 | def num_token(self): 65 | return len(self.token2idx) 66 | 67 | @property 68 | def num_labels(self): 69 | return len(self.label2idx) 70 | 71 | def add_token(self, token): 72 | if token not in self.token2idx: 73 | self.token2idx[token] = self.next_idx 74 | self.idx2token[self.next_idx] = token 75 | self.next_idx += 1 76 | 77 | def add_label(self, label): 78 | if label not in self.label2idx: 79 | self.label2idx[label] = self.next_label_idx 80 | self.idx2label[self.next_label_idx] = label 81 | self.next_label_idx += 1 82 | 83 | def get_idx(self, word): 84 | '''Gets the idx if it exists, otherwise returns -1.''' 85 | if word in self.token2idx: 86 | return self.token2idx[word] 87 | else: 88 | return self.token2idx['OOV'] 89 | 90 | def get_idx_label(self, label): 91 | '''Gets the idx of the label''' 92 | return self.label2idx[label] 93 | 94 | def 
get_word(self, idx): 95 | '''Gets the word if it exists, otherwise returns OOV.''' 96 | if idx in self.idx2token: 97 | return self.idx2token[idx] 98 | else: 99 | return self.idx2token[0] 100 | 101 | def save_to_disk(self, name=''): 102 | log.info('Saving vocab to: {0}'.format(self.path)) 103 | pickle.dump([self.token2idx, self.idx2token, self.label2idx, 104 | self.idx2label], open(self.path + name, 'wb')) 105 | 106 | def load_from_disk(self, name=''): 107 | if not os.path.exists(self.path + name): 108 | return False 109 | timestamp = time.ctime(os.path.getmtime(self.path + name)) 110 | timestamp = datetime.datetime.strptime(timestamp, '%a %b %d %H:%M:%S %Y') 111 | age_in_hours = (datetime.datetime.now() - timestamp).seconds/60./60. 112 | log.info('Loading vocab from: {0}'.format(self.path + name)) 113 | self.token2idx, self.idx2token, self.label2idx, self.idx2label = pickle.load(open(self.path, 'rb')) 114 | if age_in_hours > 12: 115 | log.info('Vocabulary outdated: {0}'.format(self.path + name)) 116 | return False 117 | else: 118 | return True 119 | 120 | def download_glove(self): 121 | if not os.path.exists(join(get_data_path(), 'glove')): 122 | log.info('Glove data is missing, dowloading data now...') 123 | os.mkdir(join(get_data_path(), 'glove')) 124 | bashmagic.wget("http://nlp.stanford.edu/data/glove.6B.zip", join(get_data_path(),'glove')) 125 | bashmagic.unzip(join(get_data_path(), 'glove', 'glove.6B.zip'), join(get_data_path(), 'glove')) 126 | 127 | def prepare_glove(self, dimension): 128 | if self.index is not None: return 129 | if not os.path.exists(join(get_data_path(), 'glove', 'index_50.p')): 130 | dims = [50, 100, 200, 300] 131 | base_filename = 'glove.6B.{0}d.txt' 132 | paths = [join(get_data_path(), 'glove', base_filename.format(dim)) for dim in dims] 133 | for path, dim in zip(paths, dims): 134 | index = {} 135 | index = {'PATH' : path} 136 | with open(path, 'rb') as f: 137 | log.info('Building index for {0}', path) 138 | while True: 139 | prev_pos 
def get_glove_vector(self, token, dimension=300):
    '''Returns the GloVe embedding of `token` as a float32 numpy array.

    Results are memoised in ``self.glove_cache`` (keyed by token).

    Args:
        token: The word to look up.
        dimension: Requested embedding size, forwarded to ``get_glove_list``.

    Returns:
        A 1-D ``np.float32`` array, or None if ``get_glove_list`` has no
        entry for the token.
    '''
    if token in self.glove_cache:
        cached = self.glove_cache[token]
        # The cache is keyed by token only. Reuse the entry only if it has
        # the requested size; otherwise a vector cached for one dimension
        # would be returned for a later request with a different dimension.
        if len(cached) == dimension:
            return cached
    vec = self.get_glove_list(token, dimension)
    if vec is None:
        return None
    # get_glove_list yields the vector components as strings; numpy parses
    # them while converting to float32.
    arr = np.array(vec, dtype=np.float32)
    self.glove_cache[token] = arr
    return arr
180 | self.download_glove() 181 | self.prepare_glove(dimension) 182 | vec = None 183 | if token in self.index: 184 | p = self.index['PATH'] 185 | with open(p, 'rb') as f: 186 | start, end = self.index[token] 187 | f.seek(start) 188 | line = f.read(end-start).decode('utf-8') 189 | data = line.strip().split(' ') 190 | vec = data[1:] 191 | 192 | return vec 193 | 194 | def exists_in_glove(self, token, dimension=300): 195 | self.download_glove() 196 | self.prepare_glove(dimension) 197 | return token in self.index 198 | 199 | 200 | def get_glove_matrix(self, dimension): 201 | assert dimension in [50, 100, 200, 300], 'Dimension not supported! Only dimension 50, 100, 200, and 300 are supported!' 202 | self.download_glove() 203 | return self.load_matrix(dimension) 204 | -------------------------------------------------------------------------------- /IJCAI2021_demo/kg_completion/spodernet/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/spodernet/utils/__init__.py -------------------------------------------------------------------------------- /IJCAI2021_demo/kg_completion/spodernet/utils/cuda_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch 3 | from torch.cuda import Event 4 | 5 | class CUDATimer(object): 6 | def __init__(self, silent=False): 7 | self.cumulative_secs = {} 8 | self.current_ticks = {} 9 | self.silent = silent 10 | self.end = Event(enable_timing=True, blocking=True) 11 | 12 | def tick(self, name='default'): 13 | if name not in self.current_ticks: 14 | start = Event(enable_timing=True, blocking=True) 15 | start.record() 16 | self.current_ticks[name] = start 17 | 18 | return 0.0 19 | else: 20 | if name not in self.cumulative_secs: 21 | self.cumulative_secs[name] = 0 22 | 
def tock(self, name='default'):
    '''Stops timer `name`, reports its accumulated seconds and resets it.

    Calls ``self.tick(name)`` first, then reads the accumulated value,
    prints it unless ``self.silent`` is set, and removes all bookkeeping
    for `name`.

    Returns:
        The accumulated seconds recorded for `name`.
    '''
    self.tick(name)
    elapsed = self.cumulative_secs[name]
    if not self.silent:
        print('Time taken for {0}: {1:.8f}s'.format(name, elapsed))
    del self.cumulative_secs[name]
    # A single tolerant pop covers both "still ticking" and "already stopped".
    self.current_ticks.pop(name, None)

    return elapsed
39 | values = [] 40 | names = [] 41 | for i, token in enumerate(args): 42 | if i % 2 == 0: 43 | names.append(token) 44 | else: 45 | values.append(token) 46 | 47 | for i in range(len(names)): 48 | if names[i] in alias2params: 49 | log.debug('Replaced parameters alias {0} with name {1}', names[i], alias2params[names[i]]) 50 | names[i] = alias2params[names[i]] 51 | 52 | for i in range(len(names)): 53 | name = names[i] 54 | if name[:2] == '--': continue 55 | if name not in params2type: 56 | log.info('List of possible parameters: {0}', params2type.keys()) 57 | log.error('Parameter {0} does not exist. Prefix your custom parameters with -- to skip parsing for global config', name) 58 | values[i] = params2type[name](values[i]) 59 | 60 | for name, value in zip(names, values): 61 | if name[:2] == '--': continue 62 | params2field[name](value) 63 | log.info('Set parameter {0} to {1}', name, value) 64 | 65 | input_dropout = 0.0 66 | feature_map_dropout = 0.0 67 | use_transposed_convolutions = False 68 | use_bias = True 69 | 70 | params2type = {} 71 | params2type['learning_rate'] = lambda x: float(x) 72 | params2type['learning_rate_decay'] = lambda x: float(x) 73 | params2type['dropout'] = lambda x: float(x) 74 | params2type['batch_size'] = lambda x: int(x) 75 | params2type['L2'] = lambda x: float(x) 76 | params2type['embedding_dim'] = lambda x: int(x) 77 | params2type['hidden_size'] = lambda x: int(x) 78 | params2type['input_dropout'] = lambda x: float(x) 79 | params2type['label_smoothing_epsilon'] = lambda x: float(x) 80 | params2type['feature_map_dropout'] = lambda x: float(x) 81 | params2type['use_conv_transpose'] = lambda x: x.lower() == 'true' or x == '1' 82 | params2type['use_bias'] = lambda x: x.lower() == 'true' or x == '1' 83 | params2type['optimizer'] = lambda x: x 84 | params2type['epochs'] = lambda x: int(x) 85 | params2type['dataset'] = lambda x: x 86 | params2type['model_name'] = lambda x: x 87 | params2type['process'] = lambda x: x.lower() == 'true' or x == '1' 
# Short command-line aliases mapped to the canonical parameter names that
# params2type / params2field are keyed by.
alias2params = {
    'lr': 'learning_rate',
    'lr_decay': 'learning_rate_decay',
    'l2': 'L2',
    'input_drop': 'input_dropout',
    'hidden_drop': 'dropout',
    'feat_drop': 'feature_map_dropout',
    'bias': 'use_bias',
    'conv_trans': 'use_conv_transpose',
    'opt': 'optimizer',
    'label_smoothing': 'label_smoothing_epsilon',
    'model': 'model_name',
}


# Canonical parameter name -> one-argument setter that stores the value on the
# Config class attribute of the same name. The inner factory binds each field
# name eagerly so every setter writes to its own attribute.
params2field = {
    field: (lambda target: (lambda x: setattr(Config, target, x)))(field)
    for field in (
        'learning_rate',
        'learning_rate_decay',
        'dropout',
        'batch_size',
        'L2',
        'embedding_dim',
        'hidden_size',
        'input_dropout',
        'feature_map_dropout',
        'use_conv_transpose',
        'use_bias',
        'optimizer',
        'label_smoothing_epsilon',
        'epochs',
        'dataset',
        'process',
        'model_name',
    )
}
# util functions start
#
# These functions also exist in util.py, but since the logger is imported
# everywhere they need to be duplicated here.

def get_home_path():
    '''Returns the current user's home directory.

    Uses the HOME environment variable when it is set and falls back to
    ``os.path.expanduser('~')`` otherwise, so that platforms without HOME
    (e.g. Windows) do not fail with a KeyError.
    '''
    return os.environ.get('HOME') or os.path.expanduser('~')

def get_logger_path():
    '''Returns the directory that log files are written to: <home>/.data/log_files.'''
    return join(get_home_path(), '.data', 'log_files')

def make_dirs_if_not_exists(path):
    '''Creates `path` (including missing parents) unless it already exists.'''
    if not os.path.exists(path):
        # exist_ok guards against another process creating the directory
        # between the existence check and this call.
        os.makedirs(path, exist_ok=True)

# util functions end
file_name) 62 | self.path = path 63 | make_dirs_if_not_exists(get_logger_path()) 64 | self.f = open(path, write_type) 65 | self.f_statistical = open(path_statistical, write_type) 66 | self.rdm = np.random.RandomState(234234) 67 | self.debug('Created log file at: {0} with write type: {1}'.format(path, write_type)) 68 | self.once_dict = {} 69 | 70 | def __del__(self): 71 | self.f.close() 72 | self.f_statistical.close() 73 | 74 | def wrap_message(self, message, log_level, *args): 75 | return '{0} ({2}): {1}'.format(datetime.datetime.now(), message.format(*args), log_level.name) 76 | 77 | def statistical(self, message, p, *args): 78 | if Logger.GLOBAL_LOG_LEVEL == LogLevel.STATISTICAL: 79 | self._log_statistical(message, p, *args) 80 | 81 | def debug(self, message, *args): 82 | self._log(message, LogLevel.DEBUG, *args) 83 | 84 | def info_once(self, message, *args): 85 | if LogLevel.INFO < Logger.GLOBAL_LOG_LEVEL: return 86 | if message not in self.once_dict: self.once_dict[message] = 0 87 | if self.once_dict[message] < Logger.PRINT_COUNT: 88 | self.once_dict[message] += 1 89 | self._log(message, LogLevel.INFO, *args) 90 | 91 | def debug_once(self, message, *args): 92 | if LogLevel.DEBUG < Logger.GLOBAL_LOG_LEVEL: return 93 | if message not in self.once_dict: self.once_dict[message] = 0 94 | if self.once_dict[message] < Logger.PRINT_COUNT: 95 | self.once_dict[message] += 1 96 | self._log(message, LogLevel.DEBUG, *args) 97 | 98 | def info(self, message, *args): 99 | self._log(message, LogLevel.INFO, *args) 100 | 101 | def warning(self, message, *args): 102 | self._log(message, LogLevel.WARNING, *args) 103 | 104 | def error(self, message, *args): 105 | self._log(message, LogLevel.ERROR, *args) 106 | raise Exception(message.format(*args)) 107 | 108 | def _log_statistical(self, message, p, *args): 109 | rdm_num = self.rdm.rand() 110 | if Logger.USE_GLOBAL_STATISTICAL_LOG_PROBABILITY: 111 | if rdm_num < Logger.LOG_PROPABILITY: 112 | message = self.wrap_message(message, 
LogLevel.STATISTICAL, *args) 113 | self.f_statistical.write(message + '\n') 114 | else: 115 | if rdm_num < p: 116 | message = self.wrap_message(message, LogLevel.STATISTICAL, *args) 117 | self.f_statistical.write(message + '\n') 118 | 119 | def _log(self, message, log_level=LogLevel.INFO, *args): 120 | if log_level >= Logger.GLOBAL_LOG_LEVEL: 121 | message = self.wrap_message(message, log_level, *args) 122 | if message.strip() != '': 123 | print(message) 124 | self.f.write(message + '\n') 125 | if GlobalLogger.f_global_logger is None: GlobalLogger.init() 126 | GlobalLogger.f_global_logger.write(message + '\n') 127 | 128 | 129 | -------------------------------------------------------------------------------- /IJCAI2021_demo/kg_completion/spodernet/utils/spacy_util.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | subjects = set(['nsubj']) 4 | objects = set(['dobj', 'pobj']) 5 | 6 | def merge_noun_phrases(sent_doc): 7 | for np in sent_doc.noun_chunks: 8 | np.merge(np.root.tag_, np.text, np.root.ent_type_) 9 | 10 | def merge_entities(sent_doc): 11 | for ent in sent_doc.ents: 12 | ent.merge(ent.root.dep_, ent.text, ent.label_) 13 | 14 | def merge_verbs(sent_doc): 15 | has_double_verb = False 16 | for span_length in [3, 2]: 17 | i = 1 18 | while i < len(sent_doc)-1: 19 | token = sent_doc[i] 20 | if token.pos_ == 'VERB': 21 | full_match = True 22 | for j in range(1, span_length): 23 | full_match &= sent_doc[i-j].pos_ == 'VERB' 24 | if full_match: 25 | span = sent_doc[i-1:i+span_length-1] 26 | span.merge() 27 | i += span_length-1 28 | has_double_verb = True 29 | i += 1 30 | 31 | def merge_with_set(sent_doc, to_match, write_key='pobj'): 32 | for span_length in [5, 4, 3, 2]: 33 | i = span_length-1 34 | while i < len(sent_doc)-1: 35 | token = sent_doc[i] 36 | if token.dep_ in write_key: 37 | pos, dep = token.pos_, token.dep_ 38 | full_match = True 39 | for j in range(1, span_length): 40 | full_match &= 
sent_doc[i-j].dep_ in to_match 41 | full_match &= sent_doc[i-j].pos_ != 'VERB' 42 | idx = sent_doc[i-j].idx 43 | if full_match: 44 | span = sent_doc[i-1:i+span_length-1] 45 | span.merge() 46 | sent_doc[i-1].dep_ = dep 47 | i += span_length-1 48 | i += 1 49 | 50 | def merge_tokens(sent_doc): 51 | merge_noun_phrases(sent_doc) 52 | merge_entities(sent_doc) 53 | merge_verbs(sent_doc) 54 | merge_with_set(sent_doc, set(['pobj', 'prep'])) 55 | merge_with_set(sent_doc, set(['pobj', 'prep'])) 56 | merge_with_set(sent_doc, set(['pobj', 'prep'])) 57 | merge_with_set(sent_doc, set(['attr', 'punct', 'cc', 'conj', 'pobj']), 'pobj') 58 | merge_with_set(sent_doc, set(['attr', 'punct', 'cc', 'conj', 'pobj']), 'pobj') 59 | merge_with_set(sent_doc, set(['dobj', 'pobj']), 'dobj') 60 | merge_with_set(sent_doc, set(['dobj', 'pobj']), 'dobj') 61 | merge_with_set(sent_doc, set(['attr', 'punct', 'cc', 'conj', 'pobj', 'dobj']), 'dobj') 62 | merge_with_set(sent_doc, set(['attr', 'punct', 'cc', 'conj', 'pobj', 'dobj']), 'dobj') 63 | 64 | def extract_triples(sent_doc): 65 | triples = [] 66 | triple = [] 67 | for token in sent_doc: 68 | if token.pos_ == 'VERB': 69 | if len(triple) == 0: continue 70 | if triple[-1].dep_ in subjects: 71 | triple.append(token) 72 | else: 73 | triple = [] 74 | if token.dep_ in subjects: 75 | if len(triple) == 0: 76 | triple.append(token) 77 | else: 78 | triple = [token] 79 | if token.dep_ in objects: 80 | if len(triple) == 0: continue 81 | if triple[-1].pos_ == 'VERB': 82 | triple.append(token) 83 | triples.append(triple) 84 | triple = [] 85 | else: 86 | triple = [] 87 | return triples 88 | -------------------------------------------------------------------------------- /IJCAI2021_demo/kg_completion/spodernet/utils/util.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | from scipy.sparse import csr_matrix, spmatrix 3 | 4 | import h5py 5 | import os 6 | import time 7 | import os 8 | import numpy as 
def load_dense_hdf(path, keyword='default'):
    '''Reads and returns a numpy array from a hdf5 file.

    Args:
        path: Location of the hdf5 file.
        keyword: Name of the dataset inside the file to read.

    Returns:
        The full contents of the dataset, read via ``dset[:]``.
    '''
    log.debug_once('Reading hdf5 file from: {0}', path)
    # The context manager closes the file even if the dataset is missing or
    # reading it raises; previously the handle leaked in that case.
    with h5py.File(path, 'r') as h5file:
        dset = h5file.get(keyword)
        data = dset[:]
    return data
def embedding_sequence2text(vocab, embedding, break_at_0=True):
    '''Converts rows of token indices back into lists of words.

    Args:
        vocab: Object providing ``get_word(idx)``.
        embedding: A numpy array, or an object exposing ``.cpu().numpy()``
            (for ``torch.autograd.Variable`` the ``.data`` attribute is used),
            iterated row by row.
        break_at_0: If True, stop converting a row at the first index 0.
            If False, convert every index of the row, including 0.

    Returns:
        A list with one list of words per row.
    '''
    if not isinstance(embedding, np.ndarray):
        if isinstance(embedding, torch.autograd.Variable):
            emb = embedding.data.cpu().numpy()
        else:
            emb = embedding.cpu().numpy()
    else:
        emb = embedding
    sentences = []
    for row in emb:
        sentence_array = []
        for idx in row:
            # Honour the flag: it used to be ignored, so a row was always
            # truncated at the first 0 regardless of break_at_0.
            if break_at_0 and idx == 0:
                break
            sentence_array.append(vocab.get_word(idx))
        sentences.append(sentence_array)
    return sentences
class Timer(object):
    '''Named wall-clock stopwatches that accumulate time over tick pairs.

    ``cumulative_secs`` maps a name to the seconds accumulated so far and
    ``current_ticks`` maps a name to the start time of a running interval.
    '''

    def __init__(self, silent=False):
        self.silent = silent
        self.cumulative_secs = {}
        self.current_ticks = {}

    def tick(self, name='default'):
        '''Starts an interval for `name`, or closes the one that is running.

        Returns:
            0.0 when an interval was started; otherwise the total seconds
            accumulated for `name` after adding the interval just closed.
        '''
        started_at = self.current_ticks.pop(name, None)
        if started_at is None:
            self.current_ticks[name] = time.time()
            return 0.0

        total = self.cumulative_secs.get(name, 0) + (time.time() - started_at)
        self.cumulative_secs[name] = total
        return total

    def tock(self, name='default'):
        '''Ticks `name` once more, logs the accumulated time and resets it.

        Returns:
            The seconds accumulated for `name` before the reset.

        Raises:
            KeyError: if no seconds have been accumulated for `name` yet.
        '''
        self.tick(name)
        elapsed = self.cumulative_secs[name]
        if not self.silent:
            log.info('Time taken for {0}: {1:.8f}s'.format(name, elapsed))
        del self.cumulative_secs[name]
        self.current_ticks.pop(name, None)

        return elapsed
len(sys.argv) > 1: 18 | dataset_name = sys.argv[1] 19 | else: 20 | dataset_name = 'FB15k-237' 21 | #dataset_name = 'FB15k' 22 | #dataset_name = 'yago' 23 | #dataset_name = 'WN18RR' 24 | 25 | print('Processing dataset {0}'.format(dataset_name)) 26 | 27 | rdm = np.random.RandomState(2342423) 28 | base_path = 'kg_completion/data/{0}/'.format(dataset_name) 29 | files = ['train.txt', 'valid.txt', 'test.txt'] 30 | 31 | data = [] 32 | for p in files: 33 | with open(join(base_path, p)) as f: 34 | data = f.readlines() + data 35 | 36 | 37 | label_graph = {} 38 | train_graph = {} 39 | test_cases = {} 40 | for p in files: 41 | test_cases[p] = [] 42 | train_graph[p] = {} 43 | 44 | 45 | for p in files: 46 | with open(join(base_path, p)) as f: 47 | for i, line in enumerate(f): 48 | e1, rel, e2 = line.split('\t') 49 | e1 = e1.strip() 50 | e2 = e2.strip() 51 | rel = rel.strip() 52 | rel_reverse = rel+ '_reverse' 53 | 54 | # data 55 | # (Mike, fatherOf, John) 56 | # (John, fatherOf, Tom) 57 | 58 | if (e1 , rel) not in label_graph: 59 | label_graph[(e1, rel)] = set() 60 | 61 | if (e2, rel_reverse) not in label_graph: 62 | label_graph[(e2, rel_reverse)] = set() 63 | 64 | if (e1, rel) not in train_graph[p]: 65 | train_graph[p][(e1, rel)] = set() 66 | if (e2, rel_reverse) not in train_graph[p]: 67 | train_graph[p][(e2, rel_reverse)] = set() 68 | 69 | # labels 70 | # (Mike, fatherOf, John) 71 | # (John, fatherOf, Tom) 72 | # (John, fatherOf_reverse, Mike) 73 | # (Tom, fatherOf_reverse, Mike) 74 | label_graph[(e1, rel)].add(e2) 75 | 76 | label_graph[(e2, rel_reverse)].add(e1) 77 | 78 | # test cases 79 | # (Mike, fatherOf, John) 80 | # (John, fatherOf, Tom) 81 | test_cases[p].append([e1, rel, e2]) 82 | 83 | # data 84 | # (Mike, fatherOf, John) 85 | # (John, fatherOf, Tom) 86 | # (John, fatherOf_reverse, Mike) 87 | # (Tom, fatherOf_reverse, John) 88 | train_graph[p][(e1, rel)].add(e2) 89 | train_graph[p][(e2, rel_reverse)].add(e1) 90 | 91 | 92 | 93 | def write_training_graph(cases, graph, 
def write_evaluation_graph(cases, graph, path):
    '''Writes one JSON line per evaluation triple to `path`.

    Args:
        cases: Iterable of ``(e1, rel, e2)`` triples.
        graph: Mapping from ``(entity, relation)`` to the set of entities
            reachable through that relation; it must contain both
            ``(e1, rel)`` and ``(e2, rel + '_reverse')`` for every case.
        path: Output file; it is overwritten.

    Each record holds the triple, the reverse relation under ``rel_eval``,
    and the space-joined label entities for both directions under
    ``e2_multi1`` / ``e2_multi2``.

    Raises:
        KeyError: if a required ``(entity, relation)`` key is missing from
            `graph`.
    '''
    with open(path, 'w') as f:
        for e1, rel, e2 in cases:
            # (Mike, fatherOf) -> John
            # (John, fatherOf, Tom)
            rel_reverse = rel + '_reverse'
            # Sort the label sets: set iteration order of strings changes
            # between interpreter runs, which made the output file
            # non-reproducible.
            entities1 = " ".join(sorted(graph[(e1, rel)]))
            entities2 = " ".join(sorted(graph[(e2, rel_reverse)]))

            data_point = {}
            data_point['e1'] = e1
            data_point['e2'] = e2
            data_point['rel'] = rel
            data_point['rel_eval'] = rel_reverse
            data_point['e2_multi1'] = entities1
            data_point['e2_multi2'] = entities2

            f.write(json.dumps(data_point) + '\n')
write_training_graph(all_cases, label_graph, 'kg_completion/data/{0}/e1rel_to_e2_full.json'.format(dataset_name)) 149 | -------------------------------------------------------------------------------- /IJCAI2021_demo/math_word_problem_solving/config.yaml: -------------------------------------------------------------------------------- 1 | graph_construction_name: "dependency" 2 | graph_embedding_name: "graphsage" 3 | decoder_name: "stdtree" 4 | 5 | graph_construction_args: 6 | graph_construction_share: 7 | graph_type: 'dependency' 8 | root_dir: "./data" 9 | topology_subdir: 'DependencyGraph' 10 | thread_number: 4 11 | port: 9000 12 | timeout: 15000 13 | 14 | graph_construction_private: 15 | edge_strategy: 'homogeneous' 16 | merge_strategy: 'tailhead' 17 | sequential_link: true 18 | as_node: false 19 | 20 | node_embedding: 21 | input_size: 300 22 | hidden_size: 300 23 | word_dropout: 0.1 24 | rnn_dropout: 0.1 25 | fix_bert_emb: false 26 | fix_word_emb: false 27 | embedding_style: 28 | single_token_item: true 29 | emb_strategy: "w2v_bilstm" 30 | num_rnn_layers: 1 31 | bert_model_name: null 32 | bert_lower_case: null 33 | 34 | sim_metric_type: 'weighted_cosine' 35 | num_heads: 1 36 | top_k_neigh: null 37 | epsilon_neigh: 0.5 38 | smoothness_ratio: 0.1 39 | connectivity_ratio: 0.05 40 | sparsity_ratio: 0.1 41 | 42 | graph_embedding_args: 43 | graph_embedding_share: 44 | num_layers: 1 45 | input_size: 300 46 | hidden_size: 300 47 | output_size: 300 48 | direction_option: "undirected" 49 | feat_drop: 0.0 50 | attn_drop: 0.0 51 | 52 | graph_embedding_private: 53 | aggregator_type: "lstm" 54 | bias: true 55 | norm: null 56 | activation: "relu" 57 | use_edge_weight: false 58 | 59 | decoder_args: 60 | rnn_decoder_share: 61 | rnn_type: "lstm" 62 | input_size: 300 63 | hidden_size: 300 64 | rnn_emb_input_size: 300 65 | use_copy: true 66 | graph_pooling_strategy: null 67 | attention_type: "uniform" 68 | fuse_strategy: "concatenate" 69 | dropout: 0.3 70 | teacher_forcing_rate: 
1.0 71 | 72 | rnn_decoder_private: 73 | max_decoder_step: 35 74 | max_tree_depth: 8 75 | use_sibling: false 76 | use_input_feed: true 77 | -------------------------------------------------------------------------------- /IJCAI2021_demo/math_word_problem_solving/data/processed/DependencyGraph/data.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/math_word_problem_solving/data/processed/DependencyGraph/data.pt -------------------------------------------------------------------------------- /IJCAI2021_demo/math_word_problem_solving/data/processed/DependencyGraph/vocab.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/math_word_problem_solving/data/processed/DependencyGraph/vocab.pt -------------------------------------------------------------------------------- /IJCAI2021_demo/math_word_problem_solving/imgs/g2t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/math_word_problem_solving/imgs/g2t.png -------------------------------------------------------------------------------- /IJCAI2021_demo/math_word_problem_solving/utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import sympy 4 | from random import randint 5 | from sympy.parsing.sympy_parser import parse_expr 6 | 7 | def convert_to_string(idx_list, form_manager): 8 | w_list = [] 9 | for i in range(len(idx_list)): 10 | w_list.append(form_manager.get_idx_symbol(int(idx_list[i]))) 11 | return " ".join(w_list) 12 | 13 | def is_all_same(c1, c2, form_manager): 14 | all_same = False 15 | if len(c1) == 
def compute_accuracy(candidate_list, reference_list, form_manager):
    """Returns the fraction of candidates that match their reference.

    Candidates and references are compared pairwise by position with
    ``is_all_same``. If the lists differ in length a notice is printed and
    only the first ``min(len(candidate_list), len(reference_list))`` pairs
    are scored.

    Args:
        candidate_list: Indexable sequence of predicted sequences.
        reference_list: Indexable sequence of ground-truth sequences.
        form_manager: Passed through unchanged to ``is_all_same``.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when there is nothing to compare.
    """
    if len(candidate_list) != len(reference_list):
        print("candidate list has length {}, reference list has length {}\n".format(
            len(candidate_list), len(reference_list)))
    len_min = min(len(candidate_list), len(reference_list))
    if len_min == 0:
        # Nothing to score; previously this raised ZeroDivisionError.
        return 0.0
    c = sum(
        1 for i in range(len_min)
        if is_all_same(candidate_list[i], reference_list[i], form_manager)
    )
    return c / float(len_min)
def ranking_and_hits(model, dev_rank_batcher, vocab, name, kg_graph=None):
    """Log filtered ranking metrics for a model and return the overall MRR.

    For every batch the model is queried in both directions
    (``e1, rel -> e2`` and ``e2, rel_eval -> e1``). Scores of all entities
    listed in ``e2_multi1`` / ``e2_multi2`` are zeroed, the score of the
    actual target is restored, and the 1-based rank of the target in the
    descending score order is recorded.

    Args:
        model: object exposing ``forward(entity, relation, kg_graph)``.
        dev_rank_batcher: iterable of dicts with keys ``e1``, ``e2``,
            ``rel``, ``rel_eval``, ``e2_multi1`` and ``e2_multi2``; it must
            also expose ``state.loss``.
        vocab: unused; kept for interface compatibility.
        name: label written to the log before the metrics.
        kg_graph: optional graph forwarded unchanged to ``model.forward``.

    Returns:
        Mean reciprocal rank over both directions.
    """
    log.info('')
    log.info('-'*50)
    log.info(name)
    log.info('-'*50)
    log.info('')
    num_levels = 10
    hits_left = [[] for _ in range(num_levels)]
    hits_right = [[] for _ in range(num_levels)]
    hits = [[] for _ in range(num_levels)]
    ranks = []
    ranks_left = []
    ranks_right = []

    for str2var in dev_rank_batcher:
        e1 = str2var['e1']
        e2 = str2var['e2']
        rel = str2var['rel']
        rel_reverse = str2var['rel_eval']
        e2_multi1 = str2var['e2_multi1'].float()
        e2_multi2 = str2var['e2_multi2'].float()
        pred1 = model.forward(e1, rel, kg_graph)
        pred2 = model.forward(e2, rel_reverse, kg_graph)
        pred1, pred2 = pred1.data, pred2.data
        e1, e2 = e1.data, e2.data
        e2_multi1, e2_multi2 = e2_multi1.data, e2_multi2.data
        batch_size = e1.shape[0]

        for row in range(batch_size):
            # these filters contain ALL labels
            filter1 = e2_multi1[row].long()
            filter2 = e2_multi2[row].long()

            # save the prediction that is relevant
            target_value1 = pred1[row, e2[row, 0].item()].item()
            target_value2 = pred2[row, e1[row, 0].item()].item()
            # zero all known cases (the filtered setting) ...
            pred1[row][filter1] = 0.0
            pred2[row][filter2] = 0.0
            # ... then write back the saved target scores
            pred1[row][e2[row]] = target_value1
            pred2[row][e1[row]] = target_value2

        # sort and rank
        _, argsort1 = torch.sort(pred1, 1, descending=True)
        _, argsort2 = torch.sort(pred2, 1, descending=True)
        argsort1 = argsort1.cpu().numpy()
        argsort2 = argsort2.cpu().numpy()

        for row in range(batch_size):
            # 0-based position of the target entity in the sorted scores
            rank1 = np.where(argsort1[row] == e2[row, 0].item())[0][0]
            rank2 = np.where(argsort2[row] == e1[row, 0].item())[0][0]
            # rank+1, since the lowest rank is rank 1 not rank 0
            ranks.append(rank1+1)
            ranks_left.append(rank1+1)
            ranks.append(rank2+1)
            ranks_right.append(rank2+1)

            for hits_level in range(num_levels):
                hit1 = 1.0 if rank1 <= hits_level else 0.0
                hit2 = 1.0 if rank2 <= hits_level else 0.0
                hits[hits_level].append(hit1)
                hits_left[hits_level].append(hit1)
                hits[hits_level].append(hit2)
                hits_right[hits_level].append(hit2)

        # NOTE(review): nesting reconstructed from a flattened source; this
        # reset is assumed to run once per batch -- confirm against upstream.
        dev_rank_batcher.state.loss = [0]

    for level in range(num_levels):
        log.info('Hits left @{0}: {1}'.format(level+1, np.mean(hits_left[level])))
        log.info('Hits right @{0}: {1}'.format(level+1, np.mean(hits_right[level])))
        log.info('Hits @{0}: {1}'.format(level+1, np.mean(hits[level])))
    log.info('Mean rank left: {0}', np.mean(ranks_left))
    log.info('Mean rank right: {0}', np.mean(ranks_right))
    log.info('Mean rank: {0}', np.mean(ranks))
    log.info('Mean reciprocal rank left: {0}', np.mean(1./np.array(ranks_left)))
    log.info('Mean reciprocal rank right: {0}', np.mean(1./np.array(ranks_right)))
    log.info('Mean reciprocal rank: {0}', np.mean(1./np.array(ranks)))

    return np.mean(1./np.array(ranks))
class TensorFlowConfig:
    """Class-level holder for the shared TensorFlow placeholders and session.

    All attributes start as ``None``; the placeholders are created by
    ``init_batch_size`` and the session lazily by ``get_session``.
    """
    inp = None
    support = None
    input_length = None
    support_length = None
    target = None
    index = None
    sess = None

    @staticmethod
    def init_batch_size(batch_size):
        """Create the int64 placeholders for a fixed ``batch_size``."""
        TensorFlowConfig.inp = tf.placeholder(tf.int64, [batch_size, None])
        TensorFlowConfig.support = tf.placeholder(tf.int64, [batch_size, None])
        TensorFlowConfig.input_length = tf.placeholder(tf.int64, [batch_size,])
        TensorFlowConfig.support_length = tf.placeholder(tf.int64, [batch_size,])
        TensorFlowConfig.target = tf.placeholder(tf.int64, [batch_size])
        TensorFlowConfig.index = tf.placeholder(tf.int64, [batch_size])

    @staticmethod
    def get_session():
        """Return the shared ``tf.Session``, creating it on first use."""
        if TensorFlowConfig.sess is None:
            TensorFlowConfig.sess = tf.Session()
        return TensorFlowConfig.sess


class TensorFlowConverter(IAtBatchPreparedObservable):
    """Batch hook that pairs prepared batch parts with the shared placeholders."""

    def at_batch_prepared(self, batch_parts):
        """Build the placeholder lookup and feed dict for one batch.

        Args:
            batch_parts: six-element sequence
                ``(inp, inp_len, sup, sup_len, t, idx)``.

        Returns:
            ``(str2var, feed_dict)``: ``str2var`` maps names to placeholders,
            ``feed_dict`` maps the same placeholders to the batch values.

        Raises:
            RuntimeError: if ``TensorFlowConfig.init_batch_size`` has not
                been called yet.
        """
        inp, inp_len, sup, sup_len, t, idx = batch_parts
        # Identity check: `== None` on a placeholder is not a plain bool test.
        # The original called an undefined `log` here (NameError); fail with
        # an explicit error rather than build a feed dict keyed on None.
        if TensorFlowConfig.inp is None:
            raise RuntimeError('You need to initialize the batch size via TensorflowConfig.init_batch_size(batchsize)!')

        feed_dict = {
            TensorFlowConfig.inp: inp,
            TensorFlowConfig.support: sup,
            TensorFlowConfig.input_length: inp_len,
            TensorFlowConfig.support_length: sup_len,
            TensorFlowConfig.target: t,
            TensorFlowConfig.index: idx,
        }

        str2var = {
            'input': TensorFlowConfig.inp,
            'input_length': TensorFlowConfig.input_length,
            'support': TensorFlowConfig.support,
            'support_length': TensorFlowConfig.support_length,
            'target': TensorFlowConfig.target,
            'index': TensorFlowConfig.index,
        }

        return str2var, feed_dict
class TFTrainer(object):
    """TensorFlow training/evaluation driver for a spodernet model."""

    def __init__(self, model):
        """Build the loss and Adam minimize op, then initialize variables.

        ``model.forward`` is called with the placeholder dict from
        ``build_str2var_dict`` and must return ``(logits, loss, argmax)``.
        When ``Config.L2`` is non-zero an L2 penalty over all trainable
        variables is added to the loss.
        """
        self.sess = TensorFlowConfig.get_session()
        str2var = build_str2var_dict()
        self.logits, self.loss, self.argmax = model.forward(str2var)
        optimizer = tf.train.AdamOptimizer(0.001)

        if Config.L2 != 0.0:
            self.loss += tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()]) * Config.L2

        self.min_op = optimizer.minimize(self.loss)

        tf.global_variables_initializer().run(session=self.sess)

    def train_model(self, batcher, epochs=1, iterations=None):
        """Run the minimize op over ``batcher`` for ``epochs`` passes.

        Args:
            batcher: iterable yielding ``(str2var, feed_dict)`` pairs and
                exposing a ``state`` object.
            epochs: number of passes over the batcher.
            iterations: if a positive int, stop each pass once the batch
                index reaches it; ``None`` (default) means no limit.
        """
        for epoch in range(epochs):
            for i, (str2var, feed_dict) in enumerate(batcher):
                _, argmax_values = self.sess.run([self.min_op, self.argmax], feed_dict=feed_dict)

                batcher.state.argmax = argmax_values
                batcher.state.targets = feed_dict[TensorFlowConfig.target]

                # `None > 0` raises TypeError on Python 3, so test for None first.
                if iterations is not None and iterations > 0 and i == iterations:
                    break

    def eval_model(self, batcher, iterations=None):
        """Run the argmax op over ``batcher`` without training.

        ``iterations`` has the same meaning as in ``train_model``.
        """
        for i, (str2var, feed_dict) in enumerate(batcher):
            argmax_values = self.sess.run([self.argmax], feed_dict=feed_dict)[0]

            batcher.state.argmax = argmax_values
            batcher.state.targets = feed_dict[TensorFlowConfig.target]

            if iterations is not None and iterations > 0 and i == iterations:
                break
def predictor(inputs, targets, target_size):
    """Project ``inputs`` to ``target_size`` logits and build loss/prediction ops.

    Args:
        inputs: tensor fed to a fully connected layer with no activation.
        targets: integer class labels for the sparse softmax cross-entropy.
        target_size: number of output units (classes).

    Returns:
        ``[logits, loss, predict]``: ``loss`` is the mean cross-entropy
        (op name ``predictor_loss``) and ``predict`` is the argmax over
        axis 1 (op name ``prediction``).
    """
    init = tf.contrib.layers.xavier_initializer(uniform=True)  # uniform=False for truncated normal
    logits = tf.contrib.layers.fully_connected(inputs, target_size, weights_initializer=init, activation_fn=None)

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=targets), name='predictor_loss')
    # tf.arg_max is a deprecated alias; tf.argmax is the supported spelling.
    predict = tf.argmax(tf.nn.softmax(logits), 1, name='prediction')
    return [logits, loss, predict]
class TFPairedBiDirectionalLSTM(AbstractModel):
    """Two chained readers: the second is initialized with the first's final states.

    Only conditional encoding is supported; constructing the class with
    ``conditional_encoding=False`` raises ``NotImplementedError``.
    """

    def __init__(self, hidden_size, scope=None, conditional_encoding=True):
        """Store the reader size and optional variable-scope name."""
        super(TFPairedBiDirectionalLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.scope = scope
        if not conditional_encoding:
            raise NotImplementedError("conditional_encoding=False is not implemented yet.")

    def forward(self, str2var, *args):
        """Encode the input sequence, then the support sequence conditioned on it.

        ``args`` must unpack to exactly two values ``(seqQ, seqS)``. Sequence
        lengths are read from ``TensorFlowConfig.input_length`` and
        ``TensorFlowConfig.support_length``, not from ``str2var``.

        Returns a one-element list holding the concatenation (axis 1) of
        ``states[0][1]`` and ``states[1][1]`` from the second reader.
        """
        self.expected_str2var_keys(str2var, ['input_length', 'support_length'])
        self.expected_args('seq input, seq support', 'dimension of both: [batch, timesteps, embedding dim]')
        self.generated_outputs('stacked outputs of last timestep', 'dim is [batch_size, 2x hidden size]')

        seqQ, seqS = args

        with tf.variable_scope(self.scope or "conditional_reader_seq1") as varscope1:
            #seq1_states: (c_fw, h_fw), (c_bw, h_bw)
            _, seq1_states = reader(seqQ, TensorFlowConfig.input_length, self.hidden_size, scope=varscope1)
        with tf.variable_scope(self.scope or "conditional_reader_seq2") as varscope2:
            # NOTE(review): reuse is enabled on the FIRST scope while inside
            # the second -- order-sensitive; confirm intent before changing.
            varscope1.reuse_variables()
            # each [batch_size x max_seq_length x output_size]
            outputs, states = reader(seqS, TensorFlowConfig.support_length, self.hidden_size, seq1_states, scope=varscope2)

        # Per the comment above, index [1] of each state pair is presumably
        # the hidden state h (fw, then bw) -- TODO confirm.
        output = tf.concat([states[0][1], states[1][1]], 1)

        return [output]
class TorchConverter(IAtBatchPreparedObservable):
    """Batch hook that wraps numpy batch entries in torch ``Variable`` objects."""

    def __init__(self, is_volatile):
        # Forwarded as the ``volatile`` flag of every Variable created below.
        self.is_volatile = is_volatile

    def at_batch_prepared(self, str2var):
        """Convert every non-length entry of ``str2var`` in place.

        Entries whose key contains ``'length'`` are left untouched. int32
        arrays are widened to int64 before conversion. Returns the same dict.
        """
        for name in str2var.keys():
            if 'length' not in name:
                array = str2var[name]
                if array.dtype == np.int32:
                    array = np.int64(array)
                str2var[name] = Variable(torch.from_numpy(array), volatile=self.is_volatile)
        return str2var


class TorchCUDAConverter(IAtBatchPreparedObservable):
    """Batch hook that moves every non-length entry to a CUDA device."""

    def __init__(self, device_id):
        self.device_id = device_id

    def at_batch_prepared(self, str2var):
        """Call ``.cuda(self.device_id, True)`` on each non-length entry in place.

        Returns the same dict.
        """
        for name in str2var.keys():
            if 'length' not in name:
                str2var[name] = str2var[name].cuda(self.device_id, True)
        return str2var
def get_list_of_torch_modules(model):
    """Flatten ``model.modules`` by one level.

    An entry that itself has a ``modules`` attribute contributes the items
    of that attribute; any other entry is taken as-is. Order is preserved.
    """
    modules = []
    for module in model.modules:
        if hasattr(module, 'modules'):
            modules.extend(module.modules)
        else:
            modules.append(module)
    return modules


def train_model(model, batcher, epochs=1, iterations=None):
    """Train ``model`` with Adam (lr=0.001) over ``batcher``.

    Args:
        model: object with a ``modules`` list and a ``forward(str2var)``
            returning ``(logits, loss, argmax)``.
        batcher: iterable of ``str2var`` dicts exposing a ``state`` object;
            each dict must contain ``'target'``.
        epochs: number of passes over the batcher.
        iterations: if a positive int, stop each pass once the batch index
            reaches it; ``None`` (default) means no limit.
    """
    modules = get_list_of_torch_modules(model)
    generators = []
    for module in modules:
        if Config.cuda:
            module.cuda()
        generators.append(module.parameters())

    parameters = chain.from_iterable(generators)
    optimizer = torch.optim.Adam(parameters, lr=0.001)
    for module in modules:
        module.train()

    for epoch in range(epochs):
        for i, str2var in enumerate(batcher):
            optimizer.zero_grad()
            logits, loss, argmax = model.forward(str2var)
            loss.backward()
            optimizer.step()
            batcher.state.argmax = argmax
            batcher.state.targets = str2var['target']

            # `None > 0` raises TypeError on Python 3, so test for None first.
            if iterations is not None and iterations > 0 and i == iterations:
                break
class TorchBiDirectionalLSTM(torch.nn.Module, AbstractModel):
    """Single batch-first LSTM encoder with zero-initialized start states."""

    def __init__(self, input_size, hidden_size,
                 dropout=0.0, layers=1,
                 bidirectional=True, to_cuda=False, conditional_encoding=True):
        """Create the LSTM and its reusable initial-state buffers.

        The state buffers are sized with ``Config.batch_size`` and moved to
        the GPU when ``Config.cuda`` is set. ``dropout``, ``to_cuda`` and
        ``conditional_encoding`` are accepted but not used here.
        """
        super(TorchBiDirectionalLSTM, self).__init__()

        use_bias = True
        num_directions = (1 if not bidirectional else 2)

        # NOTE(review): dropout is hard-coded to 0.2 and the `dropout`
        # argument is ignored; left as-is to keep existing behavior.
        self.lstm = LSTM(input_size, hidden_size, layers,
                         use_bias, True, 0.2, bidirectional)

        # initial hidden / cell states, zeroed on every forward call
        self.h0 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
        self.c0 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))

        if Config.cuda:
            self.h0 = self.h0.cuda()
            self.c0 = self.c0.cuda()

    def forward(self, str2var, *args):
        """Run the LSTM over the first positional input.

        Returns ``[out, hid]`` exactly as produced by the LSTM.
        """
        self.expected_str2var_keys(str2var, [])
        self.expected_args('embedded seq', 'size [batch, time steps, embedding dim]')
        self.generated_outputs('LSTM output seq', 'size [batch, time steps, 2x hidden dim]')
        # `args` is the varargs tuple; the LSTM needs the sequence itself.
        seq = args[0]
        self.h0.data.zero_()
        self.c0.data.zero_()
        out, hid = self.lstm(seq, (self.h0, self.c0))
        return [out, hid]
class TorchVariableLengthOutputSelection(torch.nn.Module, AbstractModel):
    """Select, per example, the LSTM output at position ``length - 1``."""

    def __init__(self):
        super(TorchVariableLengthOutputSelection, self).__init__()
        # Kept for attribute compatibility; never populated by this class.
        self.b1 = None
        self.b2 = None

    def forward(self, str2var, *args):
        """Gather the last valid timestep of both sequences and concatenate.

        ``args`` must unpack to ``(output_lstm1, output_lstm2)``. Lengths are
        read from ``str2var['input_length']`` and
        ``str2var['support_length']``. Each selection is reshaped to
        ``(Config.batch_size, -1)`` and the two are concatenated on axis 1.

        Returns a one-element list with the concatenated tensor.
        """
        self.expected_str2var_keys(str2var, ['input_length', 'support_length'])
        self.expected_args('LSTM output sequence input , LSTM output sequence support', 'dimension of both: [batch, time steps, 2x LSTM hidden size]')
        self.generated_outputs('stacked bidirectional outputs of last timestep', 'dim is [batch_size, 4x hidden size]')

        output_lstm1, output_lstm2 = args

        l1, l2 = str2var['input_length'], str2var['support_length']
        # Masks are rebuilt on every call. The old `self.b1 == None` guard
        # was always true because self.b1 is never assigned, and a false
        # guard would have left b1/b2 unbound.
        b1 = torch.ByteTensor(output_lstm1.size())
        b2 = torch.ByteTensor(output_lstm2.size())
        if Config.cuda:
            b1 = b1.cuda()
            b2 = b2.cuda()

        b1.fill_(0)
        for i, num in enumerate(l1.data):
            b1[i, num-1, :] = 1
        out1 = output_lstm1[b1].view(Config.batch_size, -1)

        b2.fill_(0)
        for i, num in enumerate(l2.data):
            b2[i, num-1, :] = 1
        out2 = output_lstm2[b2].view(Config.batch_size, -1)

        out = torch.cat([out1, out2], 1)
        return [out]
class Model(object):
    """Sequential container that pipes outputs of one module into the next."""

    def __init__(self, input_module=None):
        self.modules = []
        self.input_module = input_module
        self.module = self

    def add(self, module):
        """Append ``module`` to the end of the pipeline."""
        self.modules.append(module)

    def forward(self, str2var, *inputs):
        """Run every module in order and return the last module's outputs.

        Each module is called as ``module.forward(str2var, *outputs)`` with
        the previous outputs unpacked. With no modules, the ``inputs`` tuple
        is returned unchanged.
        """
        # `*inputs` is always a tuple, so the old `inputs == None` branch
        # could never fire and has been removed.
        outputs = inputs
        for module in self.modules:
            outputs = module.forward(str2var, *outputs)
        return outputs


class Trainer(object):
    """Backend-agnostic front end that dispatches training and evaluation."""

    def __init__(self, model):
        """Bind train/eval functions according to ``Config.backend``."""
        self.model = model

        self.trainer_backend = None
        self.train_func = None
        self.eval_func = None
        if Config.backend == Backends.TENSORFLOW:
            from spodernet.backends.tfbackend import TFTrainer
            self.trainer_backend = TFTrainer(model)
            # The TF trainer already owns the model, so the first argument is ignored.
            self.train_func = lambda _, batch, epochs, iterations: self.trainer_backend.train_model(batch, epochs, iterations)
            self.eval_func = lambda _, batch, iterations: self.trainer_backend.eval_model(batch, iterations)
        elif Config.backend == Backends.TORCH:
            from spodernet.backends.torchbackend import train_model, eval_model
            self.train_func = train_model
            self.eval_func = eval_model

    def train(self, batcher, epochs=1, iterations=None):
        """Train the model on ``batcher`` via the selected backend."""
        self.train_func(self.model, batcher, epochs, iterations)

    def evaluate(self, batcher, iterations=None):
        """Evaluate the model on ``batcher`` via the selected backend."""
        self.eval_func(self.model, batcher, iterations)


class AbstractModel(object):
    """Base class giving backend modules self-describing input/output checks."""

    def __init__(self):
        super(AbstractModel, self).__init__()
        self.input_str_args = None
        self.output_str_args = None
        self.used_keys = None

    def forward(self, str2var, *args):
        raise NotImplementedError("Classes that inherit from AbstractModel need to implement the forward method.")

    @property
    def modules(self):
        raise NotImplementedError("Classes that inherit from AbstractModel need to overrite the modules property.")

    def expected_str2var_keys(self, str2var, keys):
        """Record ``keys`` and log an error for each one missing from ``str2var``."""
        self.used_keys = keys
        for key in keys:
            if key not in str2var:
                log.error('Variable with name {0} expected, but not found in str2variable dict with keys {1}'.format(key, str2var.keys()))

    def expected_str2var_keys_oneof(self, str2var, keys):
        """Record ``keys`` and log an error when none of them is in ``str2var``."""
        self.used_keys = keys
        if not any(key in str2var for key in keys):
            log.error('At least one of these variable was expected: {0}. But str2var only has these variables: {1}.', keys, str2var.keys())

    def expected_args(self, str_arg_names, str_arg_description):
        """Record and log (once) the positional inputs this module expects."""
        log.debug_once('Expected args {0}'.format(str_arg_names))
        log.debug_once('Info for the expected arguments: {0}'.format(str_arg_description))
        self.input_str_args = str_arg_names

    def generated_outputs(self, str_output_names, str_output_description):
        """Record the outputs and log (once) a ``keys + args -> outputs`` summary."""
        log.debug_once('Generated outputs: {0}'.format(str_output_names))
        log.debug_once('Info for the provided outputs: {0}'.format(str_output_description))
        self.output_str_args = str_output_names
        message = '{0} + {1} -> {2}'.format(self.used_keys, self.input_str_args, self.output_str_args)
        log.info_once(message)
class PairedBiDirectionalLSTM(object):
    """Backend-dispatching wrapper around the paired bidirectional LSTM."""

    def __init__(self, input_size, hidden_size, scope=None, conditional_encoding=True):
        """Instantiate the backend module selected by ``Config.backend``.

        For TensorFlow a single ``TFPairedBiDirectionalLSTM`` is used
        (``input_size`` is not passed to it). For Torch a two-stage ``Model``
        is built: the paired LSTM followed by the variable-length output
        selection.
        """
        super(PairedBiDirectionalLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.scope = scope

        self.module = None
        if Config.backend == Backends.TENSORFLOW:
            from spodernet.backends.tfmodels import TFPairedBiDirectionalLSTM
            self.module = TFPairedBiDirectionalLSTM(hidden_size, scope, conditional_encoding)
        elif Config.backend == Backends.TORCH:
            from spodernet.backends.torchmodels import TorchPairedBiDirectionalLSTM, TorchVariableLengthOutputSelection
            model = Model()
            model.add(TorchPairedBiDirectionalLSTM(input_size, hidden_size, conditional_encoding=conditional_encoding))
            model.add(TorchVariableLengthOutputSelection())

            # NOTE(review): nesting reconstructed from a flattened source;
            # `model` exists only in this branch, so both assignments are
            # assumed to belong here -- confirm against upstream.
            self.module = model
            self.modules = model.modules

    def forward(self, str2var, *args):
        """Delegate to the backend module's ``forward``."""
        return self.module.forward(str2var, *args)


class SoftmaxCrossEntropy(object):
    """Backend-dispatching wrapper around the softmax cross-entropy head."""

    def __init__(self, input_size, num_labels):
        """Instantiate the backend module selected by ``Config.backend``.

        ``input_size`` is only passed to the Torch implementation.
        """
        super(SoftmaxCrossEntropy, self).__init__()
        self.num_labels = num_labels

        self.module = None
        if Config.backend == Backends.TENSORFLOW:
            from spodernet.backends.tfmodels import TFSoftmaxCrossEntropy
            self.module = TFSoftmaxCrossEntropy(num_labels)
        elif Config.backend == Backends.TORCH:
            from spodernet.backends.torchmodels import TorchSoftmaxCrossEntropy
            self.module = TorchSoftmaxCrossEntropy(input_size, num_labels)
        # NOTE(review): nesting reconstructed; assumed to run for every
        # backend -- confirm against upstream.
        self.modules = [self.module]

    def forward(self, str2var, *args):
        """Delegate to the backend module's ``forward``."""
        return self.module.forward(str2var, *args)
class AbstractHook(IAtIterEndObservable, IAtEpochEndObservable):
    """Base hook that tracks a running mean/variance of a per-iteration metric.

    Subclasses implement ``calculate_metric``. Statistics are printed every
    ``print_every_x_batches`` iterations and at the end of each epoch, and
    the running accumulators are reset after each print.
    """

    def __init__(self, name, metric_name, print_every_x_batches):
        self.epoch_errors = []
        self.current_scores = []
        self.name = name
        self.iter_count = 0
        self.print_every = print_every_x_batches
        self.metric_name = metric_name
        self.epoch = 1

        # Running-variance accumulators, see
        # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
        self.n = 0
        self.epoch_n = 0
        self.mean = 0
        self.M2 = 0
        self.load_backend_specific_functions()

    def load_backend_specific_functions(self):
        """Install ``self.convert_state`` for the active backend.

        For Torch, any of ``targets``, ``argmax``, ``pred``, ``loss`` and
        ``multi_labels`` that is a ``Variable`` is replaced by its ``.data``.
        For other backends the state is passed through unchanged.
        """
        if Config.backend == Backends.TORCH:
            from torch.autograd import Variable

            def convert_state(state):
                for field in ('targets', 'argmax', 'pred', 'loss', 'multi_labels'):
                    value = getattr(state, field)
                    if isinstance(value, Variable):
                        setattr(state, field, value.data)
                return state

            self.convert_state = convert_state
        else:
            self.convert_state = lambda x: x

    def calculate_metric(self, state):
        raise NotImplementedError('Classes that inherit from abstract hook need to implement the calcualte metric method.')

    def at_end_of_iter_event(self, state):
        """Fold one iteration's metric into the running statistics.

        Returns ``(lower, upper, mean, n)`` from ``print_statistic`` on
        print iterations, otherwise ``(0, 0, self.mean, self.n)``.
        """
        metric = self.calculate_metric(self.convert_state(state))

        # incremental mean / sum-of-squares update
        self.n += 1
        delta_before = metric - self.mean
        self.mean += delta_before/self.n
        delta_after = metric - self.mean
        self.M2 += delta_before*delta_after

        self.current_scores.append(metric)
        self.iter_count += 1
        if self.iter_count % self.print_every != 0:
            return 0, 0, self.mean, self.n

        lower, upper, m, n = self.print_statistic()
        self.n = 0
        self.mean = 0
        self.M2 = 0
        return lower, upper, m, n

    def at_end_of_epoch_event(self, state):
        """Print and record the epoch statistics, then reset for the next epoch.

        Returns ``(0, 0, 0, 0)`` when nothing was accumulated since the
        last reset.
        """
        if self.n == 0:
            return 0, 0, 0, 0
        self.epoch_errors.append(self.get_confidence_intervals())
        lower, upper, m, n = self.print_statistic(True)
        del self.current_scores[:]
        self.n = 0
        self.mean = 0
        self.M2 = 0
        self.epoch += 1
        self.iter_count = 0
        return lower, upper, m, n

    def get_confidence_intervals(self, percentile=0.99, limit=1000):
        """Return ``[n, lower, mean, upper]`` for a normal-approximation interval.

        ``limit`` is accepted but unused.
        """
        z = scipy.stats.norm.ppf(percentile)
        var = self.M2/ (self.n)
        SE = np.sqrt(var/self.n)
        half_width = z*SE
        return [self.n, self.mean-half_width, self.mean, self.mean+half_width]

    def print_statistic(self, at_epoch_end=False):
        """Log the current statistics; add an epoch banner when requested.

        Returns ``(lower, upper, mean, n)``.
        """
        n, lower, m, upper = self.get_confidence_intervals()
        str_message = '{3} {4}: {2:.5}\t99% CI: ({0:.5}, {1:.5}), n={5}'.format(lower, upper, m, self.name, self.metric_name, self.n)
        if at_epoch_end:
            log.info('\n')
            log.info('#'*40)
            log.info(' '*10 + 'COMPLETED EPOCH: {0}'.format(self.epoch) + ' '*30)
        log.info(str_message)
        if at_epoch_end:
            log.info('#'*40)
            log.info('\n')
        return lower, upper, m, n
| self.topk = 1 111 | if Config.backend == Backends.TORCH: 112 | import torch 113 | self.func = lambda x: torch.sum(x) 114 | 115 | def calculate_metric(self, state): 116 | if Config.backend == Backends.TORCH: 117 | correct = 0.0 118 | if len(state.argmax.size()) == 1: 119 | correct += self.func(state.targets==state.argmax) 120 | else: 121 | topk = state.argmax.size(1) 122 | for i in range(topk): 123 | correct += self.func(state.targets==state.argmax[:, i]) 124 | n = state.argmax.size()[0] 125 | return correct.item()/np.float32(n) 126 | elif Config.backend == Backends.TENSORFLOW: 127 | n = state.argmax.shape[0] 128 | return np.sum(state.targets==state.argmax)/np.float32(n) 129 | elif Config.backend == Backends.TEST: 130 | n = state.argmax.shape[0] 131 | return np.sum(state.targets==state.argmax)/np.float32(n) 132 | else: 133 | raise Exception('Backend has unsupported value {0}'.format(Config.backend)) 134 | 135 | 136 | class TopKRankingLoss(AbstractHook): 137 | def __init__(self, k, filtered=False, name='', print_every_x_batches=1000): 138 | super(TopKRankingLoss, self).__init__(name, '{1}Hits@{0} loss'.format(k, ('' if not filtered else 'Filtered ')), print_every_x_batches) 139 | self.func = None 140 | self.argsort = None 141 | self.sum_func = None 142 | self.k = k 143 | self.filtered = filtered 144 | if Config.backend == Backends.TORCH: 145 | import torch 146 | self.argsort = lambda x, k: torch.topk(x, k) 147 | self.sum_func = lambda x: torch.sum(x) 148 | 149 | 150 | def calculate_metric(self, state): 151 | if Config.backend == Backends.TORCH: 152 | if self.filtered: 153 | import torch 154 | saved = torch.index_select(state.pred,1,state.targets) 155 | state.pred[state.multi_labels.byte()] = -100000.0 156 | state.pred.index_copy_(1, state.targets, saved) 157 | 158 | max_values, argmax = self.argsort(state.pred, self.k) 159 | in_topk = 0 160 | for i in range(self.k): 161 | in_topk += self.sum_func(argmax[:,i] == state.targets) 162 | n = state.pred.size()[0] 163 | 
return in_topk/np.float32(n) 164 | else: 165 | raise Exception('Backend has unsupported value {0}'.format(Config.backend)) 166 | 167 | 168 | 169 | class LossHook(AbstractHook): 170 | def __init__(self, name='', print_every_x_batches=1000): 171 | super(LossHook, self).__init__(name, 'Loss', print_every_x_batches) 172 | 173 | def calculate_metric(self, state): 174 | if Config.backend == Backends.TORCH: 175 | state = self.convert_state(state) 176 | return state.loss.item() 177 | else: 178 | return state.loss 179 | 180 | 181 | class IntersectionHook(AbstractHook): 182 | def __init__(self, name='', print_every_x_batches=1000): 183 | super(IntersectionHook, self).__init__(name, 'Intersection', print_every_x_batches) 184 | 185 | def calculate_metric(self, state): 186 | state = self.convert_state(state) 187 | preds = state.pred 188 | targets = state.targets 189 | if Config.cuda: 190 | preds = preds.cpu() 191 | targets = targets.cpu() 192 | 193 | preds = preds.numpy() 194 | targets = targets.numpy() 195 | n = targets.size 196 | k = 0 197 | for row in range(Config.batch_size): 198 | k += np.intersect1d(preds[row], targets[row]).size 199 | 200 | return k/float(n) 201 | 202 | 203 | 204 | class ETAHook(AbstractHook, IAtEpochStartObservable): 205 | def __init__(self, name='', print_every_x_batches=1000): 206 | super(ETAHook, self).__init__(name, 'ETA', print_every_x_batches) 207 | self.t = Timer(silent=True) 208 | self.cumulative_t = 0.0 209 | self.skipped_first = False 210 | 211 | def get_time_string(self, seconds): 212 | m, s = divmod(seconds, 60) 213 | h, m = divmod(m, 60) 214 | if h < 0: h = 0 215 | if m < 0: m = 0 216 | if s < 0: s = 0 217 | return "%d:%02d:%02d" % (h, m, s) 218 | 219 | def calculate_metric(self, state): 220 | n = state.num_batches 221 | i = state.current_idx 222 | cumulative_t = self.t.tick('ETA') 223 | total_time_estimate = (cumulative_t/i)*n 224 | self.t.tick('ETA') 225 | self.cumulative_t = cumulative_t 226 | 227 | return total_time_estimate 228 | 229 | 
def print_statistic(self): 230 | if not self.skipped_first: 231 | # the first estimation is very unreliable for time measures 232 | self.skipped_first = True 233 | return 0, 0, 0, 0 234 | n, lower, m, upper = self.get_confidence_intervals() 235 | lower -= self.cumulative_t 236 | m -= self.cumulative_t 237 | upper -= self.cumulative_t 238 | lower, m, upper = self.get_time_string(lower), self.get_time_string(m), self.get_time_string(upper) 239 | log.info('{3} {4}: {2}\t99% CI: ({0}, {1}), n={5}'.format(lower, upper, m, self.name, self.metric_name, n)) 240 | return lower, upper, m, n 241 | 242 | def at_start_of_epoch_event(self, batcher_state): 243 | self.t.tick('ETA') 244 | t = self.t.tick('Epoch') 245 | 246 | def at_end_of_epoch_event(self, state): 247 | self.t.tock('ETA') 248 | epoch_time = self.t.tock('Epoch') 249 | self.epoch_errors.append([epoch_time]) 250 | log.info('Total epoch time: {0}'.format(self.get_time_string(epoch_time))) 251 | del self.current_scores[:] 252 | self.n = 0 253 | self.mean = 0 254 | self.M2 = 0 255 | self.skipped_first = False 256 | self.epoch += 1 257 | return epoch_time 258 | -------------------------------------------------------------------------------- /KDD2021_demo/kg_completion/spodernet/interfaces.py: -------------------------------------------------------------------------------- 1 | #These are completly useless, but they signify intent which is important. 

class IAtIterEndObservable(object):
    """Marker interface for objects notified at the end of every iteration."""
    def at_end_of_iter_event(self, batcher_state):
        raise NotImplementedError('Subclasses of IAtIterEndObservable need to override the end_of_iter_event method')

class IAtEpochStartObservable(object):
    """Marker interface for objects notified at the start of every epoch."""
    def at_start_of_epoch_event(self, batcher_state):
        raise NotImplementedError('Subclasses of IAtEpochStartObservable need to override the at_start_of_epoch method')

class IAtEpochEndObservable(object):
    """Marker interface for objects notified at the end of every epoch."""
    def at_end_of_epoch_event(self, batcher_state):
        raise NotImplementedError('Subclasses of IAtEpochEndObservable need to override the end_of_iter_epoch method')

class IAtBatchPreparedObservable(object):
    """Marker interface for objects notified when a batch has been prepared."""
    def at_batch_prepared(self, batch_parts):
        raise NotImplementedError('Subclasses of IAtBatchPreparedObservable need to override the at_batch_prepared method')

# --------------------------------------------------------------------------------
# /KDD2021_demo/kg_completion/spodernet/preprocessing/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/kg_completion/spodernet/preprocessing/__init__.py
# --------------------------------------------------------------------------------
# /KDD2021_demo/kg_completion/spodernet/preprocessing/vocab.py:
# --------------------------------------------------------------------------------
from collections import Counter

import numpy as np
import os
import time
import datetime
import pickle
import urllib
# import bashmagic
import time
import json

from spodernet.utils.util import get_data_path, save_data, xavier_uniform_weight
from os.path import join

from spodernet.utils.util import Logger
log = Logger('vocab.py.txt')

'''This models the vocabulary and token embeddings'''

class Vocab(object):
    '''Class that manages word/char embeddings'''

    # Key under which the glove index stores the path of the vector file.
    _INDEX_PATH_KEY = 'PATH'

    def __init__(self, path, vocab=None, labels=None):
        '''Constructor.
        Args:
            path: Location used by save_to_disk / load_from_disk.
            vocab: Counter object with vocabulary (optional).
            labels: Mapping from label index to label (optional).
        '''
        # None instead of shared mutable default objects.
        if vocab is None:
            vocab = Counter()
        if labels is None:
            labels = {}

        self.index = None
        token2idx = {}
        idx2token = {}
        self.label2idx = {}
        self.idx2label = {}
        self.glove_cache = {}

        # Index 0 is reserved for out-of-vocabulary tokens and 1 for the empty
        # token, so vocabulary tokens start at 2. Starting at 1 made the first
        # token share its index with ''.
        for i, item in enumerate(vocab.items()):
            token2idx[item[0]] = i+2
            idx2token[i+2] = item[0]

        for idx in labels:
            self.label2idx[labels[idx]] = idx
            self.idx2label[idx] = labels[idx]

        # out of vocabulary token
        token2idx['OOV'] = int(0)
        idx2token[int(0)] = 'OOV'
        # empty token
        token2idx[''] = int(1)
        idx2token[int(1)] = ''

        self.token2idx = token2idx
        self.idx2token = idx2token
        self.path = path
        # idx2token always holds the two reserved entries, so max() is safe.
        self.next_idx = int(max(idx2token) + 1)

        # Always set next_label_idx (a misspelled attribute name used to leave
        # it undefined whenever labels were passed in).
        if len(self.idx2label) > 0:
            self.next_label_idx = int(max(self.idx2label) + 1)
        else:
            self.next_label_idx = int(0)

    @property
    def num_token(self):
        return len(self.token2idx)

    @property
    def num_labels(self):
        return len(self.label2idx)

    def add_token(self, token):
        '''Assigns the next free index to token unless it is already known.'''
        if token not in self.token2idx:
            self.token2idx[token] = self.next_idx
            self.idx2token[self.next_idx] = token
            self.next_idx += 1

    def add_label(self, label):
        '''Assigns the next free label index to label unless it is already known.'''
        if label not in self.label2idx:
            self.label2idx[label] = self.next_label_idx
            self.idx2label[self.next_label_idx] = label
            self.next_label_idx += 1

    def get_idx(self, word):
        '''Gets the idx if it exists, otherwise returns the idx of OOV.'''
        if word in self.token2idx:
            return self.token2idx[word]
        else:
            return self.token2idx['OOV']

    def get_idx_label(self, label):
        '''Gets the idx of the label'''
        return self.label2idx[label]

    def get_word(self, idx):
        '''Gets the word if it exists, otherwise returns OOV.'''
        if idx in self.idx2token:
            return self.idx2token[idx]
        else:
            return self.idx2token[0]

    def save_to_disk(self, name=''):
        '''Pickles the four lookup tables to self.path + name.'''
        target = self.path + name
        log.info('Saving vocab to: {0}'.format(target))
        with open(target, 'wb') as f:
            pickle.dump([self.token2idx, self.idx2token, self.label2idx,
                         self.idx2label], f)

    def load_from_disk(self, name=''):
        '''Loads the lookup tables from self.path + name.

        Returns:
            False if the file does not exist, or if it was modified more than
            12 hours ago (the tables are still loaded in that case);
            True otherwise.
        '''
        source = self.path + name
        if not os.path.exists(source):
            return False
        modified = datetime.datetime.fromtimestamp(os.path.getmtime(source))
        # total_seconds() includes whole days; `.seconds` alone wraps every 24h.
        age_in_hours = (datetime.datetime.now() - modified).total_seconds()/60./60.
        log.info('Loading vocab from: {0}'.format(source))
        # NOTE(review): pickle executes arbitrary code; only load trusted files.
        with open(source, 'rb') as f:
            self.token2idx, self.idx2token, self.label2idx, self.idx2label = pickle.load(f)
        if age_in_hours > 12:
            log.info('Vocabulary outdated: {0}'.format(source))
            return False
        else:
            return True

    def download_glove(self):
        '''Downloads and unpacks glove.6B if the glove data folder is missing.'''
        glove_dir = join(get_data_path(), 'glove')
        if not os.path.exists(glove_dir):
            # Standard-library replacements for the bashmagic helpers, whose
            # import is commented out at the top of this file.
            import urllib.request
            import zipfile

            log.info('Glove data is missing, dowloading data now...')
            os.makedirs(glove_dir)
            archive = join(glove_dir, 'glove.6B.zip')
            urllib.request.urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", archive)
            with zipfile.ZipFile(archive) as zf:
                zf.extractall(glove_dir)

    def prepare_glove(self, dimension):
        '''Builds the byte-offset indices if needed and loads the one for dimension.

        The index maps a token to the (start, end) byte offsets of its line in
        the glove text file; the file path is stored under 'PATH'. Once an
        index is loaded, later calls return immediately.
        '''
        if self.index is not None: return
        glove_dir = join(get_data_path(), 'glove')
        if not os.path.exists(join(glove_dir, 'index_50.p')):
            dims = [50, 100, 200, 300]
            base_filename = 'glove.6B.{0}d.txt'
            paths = [join(glove_dir, base_filename.format(dim)) for dim in dims]
            for path, dim in zip(paths, dims):
                index = {self._INDEX_PATH_KEY: path}
                with open(path, 'rb') as f:
                    log.info('Building index for {0}', path)
                    while True:
                        prev_pos = f.tell()
                        line = f.readline().decode('utf-8')
                        if line == '': break
                        next_pos = f.tell()
                        data = line.strip().split(' ')
                        token = data[0]
                        index[token] = (prev_pos, next_pos)

                log.info('Saving glove index...')
                with open(join(glove_dir, 'index_{0}.p'.format(dim)), 'w') as f:
                    json.dump(index, f)

        log.info('Loading glove index...')
        with open(join(glove_dir, 'index_{0}.p'.format(dimension)), 'r') as f:
            self.index = json.load(f)

    def load_matrix(self, dim):
        '''Returns a (num_token, dim) matrix with glove rows where available.'''
        log.info('Initializing glove matrix...')
        X = xavier_uniform_weight(len(self.token2idx), dim)
        log.info('Loading vectors into glove matrix with dimension: {0}', X.shape)
        pretrained_count = 0
        n = len(self.token2idx)-2
        for i, (token, idx) in enumerate(self.token2idx.items()):
            if i % 10000 == 0: print(i)
            vec = self.get_glove_list(token, dim)
            if vec is not None:
                X[idx] = vec
                pretrained_count += 1
        log.info('Filled matrix with {0} pretrained embeddings and {1} xavier uniform initialized embeddings.', pretrained_count, n-pretrained_count)
        return X

    def get_glove_vector(self, token, dimension=300):
        '''Returns the glove vector as a float32 array, or None if unknown.'''
        # NOTE(review): the cache is keyed by token only, so it assumes a
        # single dimension per Vocab instance.
        if token in self.glove_cache: return self.glove_cache[token]
        vec = self.get_glove_list(token, dimension)
        if vec is not None:
            arr = np.array(vec, dtype=np.float32)
            self.glove_cache[token] = arr
            return arr
        else: return None

    def get_glove_list(self, token, dimension=300):
        '''Returns the vector components of token as a list of strings, or None.'''
        assert dimension in [50, 100, 200, 300], 'Dimension not supported! Only dimension 50, 100, 200, and 300 are supported!'
        self.download_glove()
        self.prepare_glove(dimension)
        vec = None
        # 'PATH' is index metadata, not a token with byte offsets.
        if token != self._INDEX_PATH_KEY and token in self.index:
            p = self.index[self._INDEX_PATH_KEY]
            with open(p, 'rb') as f:
                start, end = self.index[token]
                f.seek(start)
                line = f.read(end-start).decode('utf-8')
                data = line.strip().split(' ')
                vec = data[1:]

        return vec

    def exists_in_glove(self, token, dimension=300):
        '''Returns True if the loaded glove index has an entry for token.'''
        self.download_glove()
        self.prepare_glove(dimension)
        return token != self._INDEX_PATH_KEY and token in self.index


    def get_glove_matrix(self, dimension):
        '''Downloads glove if necessary and returns load_matrix(dimension).'''
        assert dimension in [50, 100, 200, 300], 'Dimension not supported! Only dimension 50, 100, 200, and 300 are supported!'
        self.download_glove()
        return self.load_matrix(dimension)

# --------------------------------------------------------------------------------
# /KDD2021_demo/kg_completion/spodernet/utils/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/kg_completion/spodernet/utils/__init__.py
# --------------------------------------------------------------------------------
# /KDD2021_demo/kg_completion/spodernet/utils/cuda_utils.py:
# --------------------------------------------------------------------------------
# NOTE(review): cuda_utils.py begins with the statement below. A __future__
# import is only legal as the first statement of a module, so it is shown as a
# comment inside this multi-file dump; restore it when the file is split out.
# from __future__ import print_function
import torch
from torch.cuda import Event

class CUDATimer(object):
    """Named stopwatch built on CUDA events; times are accumulated in seconds."""

    def __init__(self, silent=False):
        self.cumulative_secs = {}
        self.current_ticks = {}
        self.silent = silent
        self.end = Event(enable_timing=True, blocking=True)

    def tick(self, name='default'):
        """Start timing ``name``, or stop a running measurement and accumulate it.

        Returns 0.0 when a measurement was started, otherwise the cumulative
        seconds recorded for ``name`` so far.
        """
        if name not in self.current_ticks:
            start = Event(enable_timing=True, blocking=True)
            start.record()
            self.current_ticks[name] = start

            return 0.0
        else:
            if name not in self.cumulative_secs:
                self.cumulative_secs[name] = 0
            self.end.record()
            self.end.synchronize()
            # elapsed_time() is divided by 1000 to store seconds.
            self.cumulative_secs[name] += self.current_ticks[name].elapsed_time(self.end)/1000.
            self.current_ticks.pop(name)

            return self.cumulative_secs[name]

    def tock(self, name='default'):
        """Finish ``name``: return its cumulative seconds and forget the timer.

        Raises:
            KeyError: if no time was ever accumulated for ``name``.
        """
        self.tick(name)
        value = self.cumulative_secs[name]
        if not self.silent:
            print('Time taken for {0}: {1:.8f}s'.format(name, value))
        self.cumulative_secs.pop(name)
        # tick() may have started a fresh measurement; discard it.
        self.current_ticks.pop(name, None)

        return value

# --------------------------------------------------------------------------------
# /KDD2021_demo/kg_completion/spodernet/utils/global_config.py:
# --------------------------------------------------------------------------------
import os
from collections import namedtuple
from spodernet.utils.logger import Logger
log = Logger('global_config.py.txt')

class Backends:
    TORCH = 'pytorch'
    TENSORFLOW = 'tensorflow'
    TEST = 'test'
    CNTK = 'cntk'


class Config:
    """Process-wide settings stored as class attributes."""
    dropout = 0.0
    batch_size = 128
    learning_rate = 0.001
    backend = Backends.TORCH
    L2 = 0.000
    cuda = False
    embedding_dim = 128
    hidden_size = 256
    input_dropout = 0.0
    feature_map_dropout = 0.0
    use_conv_transpose = False
    use_bias = True
    optimizer = 'adam'
    learning_rate_decay = 1.0
    label_smoothing_epsilon = 0.1
    epochs = 1000
    dataset = None
    process = False
    model_name = None

    @staticmethod
    def parse_argv(argv):
        """Apply ``name value`` pairs from ``argv[1:]`` to ``Config``.

        Aliases are expanded through ``alias2params``, values are converted
        with ``params2type`` and stored with ``params2field``. Names starting
        with ``--`` are skipped. An unknown name is reported through
        ``log.error``.
        """
        args = argv[1:]  # argv[0] is the script name
        assert len(args) % 2 == 0, 'Global parser expects an even number of arguments.'
        names = list(args[0::2])
        values = list(args[1::2])

        for i in range(len(names)):
            if names[i] in alias2params:
                log.debug('Replaced parameters alias {0} with name {1}', names[i], alias2params[names[i]])
                names[i] = alias2params[names[i]]

        for i in range(len(names)):
            name = names[i]
            if name[:2] == '--': continue
            if name not in params2type:
                log.info('List of possible parameters: {0}', params2type.keys())
                log.error('Parameter {0} does not exist. Prefix your custom parameters with -- to skip parsing for global config', name)
            values[i] = params2type[name](values[i])

        for name, value in zip(names, values):
            if name[:2] == '--': continue
            params2field[name](value)
            log.info('Set parameter {0} to {1}', name, value)

    # NOTE(review): indentation was lost in this dump; these four assignments
    # are assumed to be class attributes of Config - confirm against the repo.
    input_dropout = 0.0
    feature_map_dropout = 0.0
    use_transposed_convolutions = False
    use_bias = True

# Converters from the command-line string to the value type of each parameter.
params2type = {}
params2type['learning_rate'] = lambda x: float(x)
params2type['learning_rate_decay'] = lambda x: float(x)
params2type['dropout'] = lambda x: float(x)
params2type['batch_size'] = lambda x: int(x)
params2type['L2'] = lambda x: float(x)
params2type['embedding_dim'] = lambda x: int(x)
params2type['hidden_size'] = lambda x: int(x)
params2type['input_dropout'] = lambda x: float(x)
params2type['label_smoothing_epsilon'] = lambda x: float(x)
params2type['feature_map_dropout'] = lambda x: float(x)
params2type['use_conv_transpose'] = lambda x: x.lower() == 'true' or x == '1'
params2type['use_bias'] = lambda x: x.lower() == 'true' or x == '1'
params2type['optimizer'] = lambda x: x
params2type['epochs'] = lambda x: int(x)
params2type['dataset'] = lambda x: x
params2type['model_name'] = lambda x: x
params2type['process'] = lambda x: x.lower() == 'true' or x == '1'

# Short command-line aliases and the canonical parameter name each stands for.
alias2params = {
    'lr': 'learning_rate',
    'lr_decay': 'learning_rate_decay',
    'l2': 'L2',
    'input_drop': 'input_dropout',
    'hidden_drop': 'dropout',
    'feat_drop': 'feature_map_dropout',
    'bias': 'use_bias',
    'conv_trans': 'use_conv_transpose',
    'opt': 'optimizer',
    'label_smoothing': 'label_smoothing_epsilon',
    'model': 'model_name',
}


def _config_setter(field):
    """Return a one-argument callable that stores its argument as Config.<field>."""
    return lambda x: setattr(Config, field, x)


# Canonical parameter name -> setter that writes the value onto Config.
params2field = {
    field: _config_setter(field)
    for field in (
        'learning_rate',
        'learning_rate_decay',
        'dropout',
        'batch_size',
        'L2',
        'embedding_dim',
        'hidden_size',
        'input_dropout',
        'feature_map_dropout',
        'use_conv_transpose',
        'use_bias',
        'optimizer',
        'label_smoothing_epsilon',
        'epochs',
        'dataset',
        'process',
        'model_name',
    )
}


# --------------------------------------------------------------------------------
# /KDD2021_demo/kg_completion/spodernet/utils/logger.py:
# --------------------------------------------------------------------------------
from enum import IntEnum
from os.path import join

import os
import datetime
import numpy as np
import time

# util functions start
#
# these functions also exist in util.py,
# but since logger is imported everywhere they need to be included here

def get_home_path():
    """Return $HOME, falling back to the expanded '~' when HOME is not set."""
    return os.environ.get('HOME') or os.path.expanduser('~')

def get_logger_path():
    """Return <home>/.data/log_files."""
    return join(get_home_path(), '.data', 'log_files')

def make_dirs_if_not_exists(path):
    """Create ``path`` including parents; an existing directory is not an error."""
    os.makedirs(path, exist_ok=True)

# util functions end
class GlobalLogger:
    """Holds the single log file that receives the messages of every Logger."""
    timestr = None
    global_logger_path = None
    f_global_logger = None

    @staticmethod
    def init():
        """Open <logger path>/full_logs/<timestamp>.txt for writing."""
        GlobalLogger.timestr = time.strftime("%Y%m%d-%H%M%S")
        # makedirs also creates missing parents and tolerates an existing folder.
        os.makedirs(join(get_logger_path(), 'full_logs'), exist_ok=True)
        GlobalLogger.global_logger_path = join(get_logger_path(), 'full_logs', GlobalLogger.timestr + '.txt')
        GlobalLogger.f_global_logger = open(GlobalLogger.global_logger_path, 'w')

    @staticmethod
    def flush():
        """Close the global log file and reopen it in append mode."""
        GlobalLogger.f_global_logger.close()
        GlobalLogger.f_global_logger = open(GlobalLogger.global_logger_path, 'a')

    def __del__(self):
        # The file is only opened lazily by init(); nothing to close before that.
        if GlobalLogger.f_global_logger is not None:
            GlobalLogger.f_global_logger.close()

class LogLevel(IntEnum):
    STATISTICAL = 0
    DEBUG = 1
    INFO = 2
    WARNING = 3
    ERROR = 4

class Logger:
    """Writes messages to stdout, a per-module file and the global log file."""
    GLOBAL_LOG_LEVEL = LogLevel.INFO
    LOG_PROPABILITY = 0.05
    USE_GLOBAL_STATISTICAL_LOG_PROBABILITY = False
    PRINT_COUNT = 2

    def __init__(self, file_name, write_type='w'):
        """Open ``file_name`` and ``statistical_<file_name>`` under the logger path."""
        path = join(get_logger_path(), file_name)
        path_statistical = join(get_logger_path(), 'statistical_' + file_name)
        self.path = path
        make_dirs_if_not_exists(get_logger_path())
        self.f = open(path, write_type)
        self.f_statistical = open(path_statistical, write_type)
        self.rdm = np.random.RandomState(234234)
        self.debug('Created log file at: {0} with write type: {1}'.format(path, write_type))
        self.once_dict = {}

    def __del__(self):
        # __init__ may have failed before the files were opened.
        for handle in (getattr(self, 'f', None), getattr(self, 'f_statistical', None)):
            if handle is not None:
                handle.close()

    def wrap_message(self, message, log_level, *args):
        """Return '<now> (<LEVEL>): <message formatted with args>'."""
        return '{0} ({2}): {1}'.format(datetime.datetime.now(), message.format(*args), log_level.name)

    def statistical(self, message, p, *args):
        """Sampled logging; active only when the global level is STATISTICAL."""
        if Logger.GLOBAL_LOG_LEVEL == LogLevel.STATISTICAL:
            self._log_statistical(message, p, *args)

    def debug(self, message, *args):
        self._log(message, LogLevel.DEBUG, *args)

    def info_once(self, message, *args):
        """Log ``message`` at INFO level at most PRINT_COUNT times."""
        if LogLevel.INFO < Logger.GLOBAL_LOG_LEVEL: return
        if message not in self.once_dict: self.once_dict[message] = 0
        if self.once_dict[message] < Logger.PRINT_COUNT:
            self.once_dict[message] += 1
            self._log(message, LogLevel.INFO, *args)

    def debug_once(self, message, *args):
        """Log ``message`` at DEBUG level at most PRINT_COUNT times."""
        if LogLevel.DEBUG < Logger.GLOBAL_LOG_LEVEL: return
        if message not in self.once_dict: self.once_dict[message] = 0
        if self.once_dict[message] < Logger.PRINT_COUNT:
            self.once_dict[message] += 1
            self._log(message, LogLevel.DEBUG, *args)

    def info(self, message, *args):
        self._log(message, LogLevel.INFO, *args)

    def warning(self, message, *args):
        self._log(message, LogLevel.WARNING, *args)

    def error(self, message, *args):
        """Log at ERROR level, then raise Exception with the formatted message."""
        self._log(message, LogLevel.ERROR, *args)
        raise Exception(message.format(*args))

    def _log_statistical(self, message, p, *args):
        """Write to the statistical file with probability ``p`` (or the global one)."""
        rdm_num = self.rdm.rand()
        if Logger.USE_GLOBAL_STATISTICAL_LOG_PROBABILITY:
            threshold = Logger.LOG_PROPABILITY
        else:
            threshold = p
        if rdm_num < threshold:
            message = self.wrap_message(message, LogLevel.STATISTICAL, *args)
            self.f_statistical.write(message + '\n')

    def _log(self, message, log_level=LogLevel.INFO, *args):
        """Emit the message if ``log_level`` is at least the global level."""
        if log_level >= Logger.GLOBAL_LOG_LEVEL:
            message = self.wrap_message(message, log_level, *args)
            if message.strip() != '':
                print(message)
                self.f.write(message + '\n')
                if GlobalLogger.f_global_logger is None: GlobalLogger.init()
                GlobalLogger.f_global_logger.write(message + '\n')



# --------------------------------------------------------------------------------
# /KDD2021_demo/kg_completion/spodernet/utils/spacy_util.py:
# --------------------------------------------------------------------------------
# NOTE(review): indentation was lost in this dump. The nesting of the loops
# below is reconstructed from variable usage - confirm against the repository.
import spacy

subjects = set(['nsubj'])
objects = set(['dobj', 'pobj'])

def merge_noun_phrases(sent_doc):
    """Merge every noun chunk of ``sent_doc`` into a single token."""
    for np in sent_doc.noun_chunks:
        np.merge(np.root.tag_, np.text, np.root.ent_type_)

def merge_entities(sent_doc):
    """Merge every entity span of ``sent_doc`` into a single token."""
    for ent in sent_doc.ents:
        ent.merge(ent.root.dep_, ent.text, ent.label_)

def merge_verbs(sent_doc):
    """Merge runs of 3, then 2, adjacent tokens whose pos_ is 'VERB'."""
    has_double_verb = False
    for span_length in [3, 2]:
        i = 1
        while i < len(sent_doc)-1:
            token = sent_doc[i]
            if token.pos_ == 'VERB':
                full_match = True
                for j in range(1, span_length):
                    full_match &= sent_doc[i-j].pos_ == 'VERB'
                if full_match:
                    span = sent_doc[i-1:i+span_length-1]
                    span.merge()
                    i += span_length-1
                    has_double_verb = True
            i += 1

def merge_with_set(sent_doc, to_match, write_key='pobj'):
    """Merge spans ending at a ``write_key`` token whose predecessors match.

    For span lengths 5 down to 2: when the dep_ of the token at ``i`` is in
    ``write_key`` and the preceding ``span_length - 1`` tokens all have a dep_
    in ``to_match`` and a pos_ other than 'VERB', the span is merged and the
    merged token receives the dep_ of the token at ``i``.
    """
    for span_length in [5, 4, 3, 2]:
        i = span_length-1
        while i < len(sent_doc)-1:
            token = sent_doc[i]
            # write_key is a string here, so this is a substring test.
            if token.dep_ in write_key:
                pos, dep = token.pos_, token.dep_
                full_match = True
                for j in range(1, span_length):
                    full_match &= sent_doc[i-j].dep_ in to_match
                    full_match &= sent_doc[i-j].pos_ != 'VERB'
                    idx = sent_doc[i-j].idx
                if full_match:
                    span = sent_doc[i-1:i+span_length-1]
                    span.merge()
                    sent_doc[i-1].dep_ = dep
                    i += span_length-1
            i += 1

def merge_tokens(sent_doc):
    """Run all merge passes in a fixed order; several passes are repeated."""
    merge_noun_phrases(sent_doc)
    merge_entities(sent_doc)
    merge_verbs(sent_doc)
    merge_with_set(sent_doc, set(['pobj', 'prep']))
    merge_with_set(sent_doc, set(['pobj', 'prep']))
    merge_with_set(sent_doc, set(['pobj', 'prep']))
    merge_with_set(sent_doc, set(['attr', 'punct', 'cc', 'conj', 'pobj']), 'pobj')
    merge_with_set(sent_doc, set(['attr', 'punct', 'cc', 'conj', 'pobj']), 'pobj')
    merge_with_set(sent_doc, set(['dobj', 'pobj']), 'dobj')
    merge_with_set(sent_doc, set(['dobj', 'pobj']), 'dobj')
    merge_with_set(sent_doc, set(['attr', 'punct', 'cc', 'conj', 'pobj', 'dobj']), 'dobj')
    merge_with_set(sent_doc, set(['attr', 'punct', 'cc', 'conj', 'pobj', 'dobj']), 'dobj')

def extract_triples(sent_doc):
    """Collect [subject, verb, object] token triples in document order.

    A triple starts at a token whose dep_ is in ``subjects``, continues with a
    'VERB' token and is completed by a token whose dep_ is in ``objects``; any
    other sequence resets the partial triple.
    """
    triples = []
    triple = []
    for token in sent_doc:
        if token.pos_ == 'VERB':
            if len(triple) == 0: continue
            if triple[-1].dep_ in subjects:
                triple.append(token)
            else:
                triple = []
        if token.dep_ in subjects:
            if len(triple) == 0:
                triple.append(token)
            else:
                triple = [token]
        if token.dep_ in objects:
            if len(triple) == 0: continue
            if triple[-1].pos_ == 'VERB':
                triple.append(token)
                triples.append(triple)
                triple = []
            else:
                triple = []
    return triples

# --------------------------------------------------------------------------------
# /KDD2021_demo/kg_completion/spodernet/utils/util.py:
# --------------------------------------------------------------------------------
from os.path import join
from scipy.sparse import csr_matrix, spmatrix

import h5py
import os
import time
import os
import numpy as np
9 | import torch 10 | 11 | from spodernet.utils.logger import Logger 12 | log = Logger('util.py.txt') 13 | 14 | rdm = np.random.RandomState(2345235) 15 | 16 | def save_dense_hdf(path, data): 17 | '''Writes a numpy array to a hdf5 file under the given path.''' 18 | log.debug_once('Saving hdf5 file to: {0}', path) 19 | h5file = h5py.File(path, "w") 20 | h5file.create_dataset("default", data=data) 21 | h5file.close() 22 | 23 | 24 | def load_dense_hdf(path, keyword='default'): 25 | '''Reads and returns a numpy array for a hdf5 file''' 26 | log.debug_once('Reading hdf5 file from: {0}', path) 27 | h5file = h5py.File(path, 'r') 28 | dset = h5file.get(keyword) 29 | data = dset[:] 30 | h5file.close() 31 | return data 32 | 33 | def save_sparse_hdf(path, data): 34 | shape = data.shape 35 | sparse = csr_matrix(data) 36 | folder, filename = os.path.split(path) 37 | save_dense_hdf(join(folder, 'data_' + filename), sparse.data) 38 | save_dense_hdf(join(folder, 'indices_' + filename), sparse.indices) 39 | save_dense_hdf(join(folder, 'indptr_' + filename), sparse.indptr) 40 | save_dense_hdf(join(folder, 'shape_dense_' + filename), shape) 41 | save_dense_hdf(join(folder, 'shape_sparse_' + filename), sparse.shape) 42 | 43 | def load_sparse_hdf(path, keyword='default'): 44 | folder, filename = os.path.split(path) 45 | data = load_dense_hdf(join(folder, 'data_' + filename)) 46 | indices = load_dense_hdf(join(folder, 'indices_' + filename)) 47 | indptr = load_dense_hdf(join(folder, 'indptr_' + filename)) 48 | shape = load_dense_hdf(join(folder, 'shape_dense_' + filename)) 49 | shape_sparse = load_dense_hdf(join(folder, 'shape_sparse_' + filename)) 50 | return csr_matrix((data, indices, indptr), shape=shape_sparse).toarray().reshape(shape) 51 | 52 | def load_data(path): 53 | folder, filename = os.path.split(path) 54 | if os.path.exists(join(folder, 'indptr_' + filename)): 55 | data = load_sparse_hdf(path) 56 | return data 57 | else: 58 | return load_dense_hdf(path) 59 | 60 | def 
def embedding_sequence2text(vocab, embedding, break_at_0=True):
    '''Converts rows of token indices into lists of words.

    Args:
        vocab: Object exposing ``get_word(idx)``.
        embedding: Iterable of index rows. A numpy array is used as is;
            anything else is converted with ``.cpu().numpy()`` (going
            through ``.data`` first for ``torch.autograd.Variable``).
        break_at_0: When True (default) decoding of a row stops at the
            first index equal to 0 (presumably the padding index - confirm
            against the vocabulary). When False every index, including 0,
            is looked up.

    Returns:
        A list with one list of words per row.
    '''
    if isinstance(embedding, np.ndarray):
        emb = embedding
    elif isinstance(embedding, torch.autograd.Variable):
        emb = embedding.data.cpu().numpy()
    else:
        emb = embedding.cpu().numpy()

    sentences = []
    for row in emb:
        sentence_array = []
        for idx in row:
            # The flag used to be ignored: rows were always cut at index 0.
            if break_at_0 and idx == 0:
                break
            sentence_array.append(vocab.get_word(idx))
        sentences.append(sentence_array)
    return sentences
class Timer(object):
    '''Accumulates wall-clock time per named interval.

    ``tick(name)`` toggles an interval: the first call starts it, the next
    call stops it and adds the elapsed seconds to the running total for
    that name. ``tock(name)`` stops the interval if it is running, reports
    the total and resets all state kept for the name.
    '''

    def __init__(self, silent=False):
        # name -> seconds accumulated over all finished intervals
        self.cumulative_secs = {}
        # name -> time.time() value at which the running interval started
        self.current_ticks = {}
        # when True, tock() does not log the measured time
        self.silent = silent

    def tick(self, name='default'):
        '''Starts or stops the interval ``name``.

        Returns:
            0.0 when the call starts an interval, otherwise the cumulative
            seconds recorded for ``name`` after stopping it.
        '''
        if name not in self.current_ticks:
            self.current_ticks[name] = time.time()
            return 0.0

        started = self.current_ticks.pop(name)
        elapsed = time.time() - started
        self.cumulative_secs[name] = self.cumulative_secs.get(name, 0) + elapsed
        return self.cumulative_secs[name]

    def tock(self, name='default'):
        '''Stops ``name`` if it is running, then returns and clears its total.

        Returns:
            The cumulative seconds for ``name``; 0.0 if nothing was ever
            recorded for it.
        '''
        # Only stop an interval that is actually running. Calling tick()
        # unconditionally used to *start* one for unknown names and then
        # fail with KeyError on the lookup below, leaving a stale tick.
        if name in self.current_ticks:
            self.tick(name)
        value = self.cumulative_secs.pop(name, 0.0)
        if not self.silent:
            log.info('Time taken for {0}: {1:.8f}s'.format(name, value))
        self.current_ticks.pop(name, None)

        return value
# Read every split once and build three views of the triples:
#   label_graph    - (entity, relation) -> all known targets, over all splits
#   train_graph[p] - the same mapping restricted to the split file p
#   test_cases[p]  - the raw [e1, rel, e2] triples of split p, in file order
for p in files:
    with open(join(base_path, p)) as f:
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file)
            # used to crash the three-way unpack below; skip it instead.
            if not line.strip():
                continue
            e1, rel, e2 = line.split('\t')
            e1 = e1.strip()
            e2 = e2.strip()
            rel = rel.strip()
            rel_reverse = rel + '_reverse'

            # data
            # (Mike, fatherOf, John)
            # (John, fatherOf, Tom)

            # labels
            # (Mike, fatherOf, John)
            # (John, fatherOf, Tom)
            # (John, fatherOf_reverse, Mike)
            # (Tom, fatherOf_reverse, John)
            label_graph.setdefault((e1, rel), set()).add(e2)
            label_graph.setdefault((e2, rel_reverse), set()).add(e1)

            # test cases
            # (Mike, fatherOf, John)
            # (John, fatherOf, Tom)
            test_cases[p].append([e1, rel, e2])

            # data
            # (Mike, fatherOf, John)
            # (John, fatherOf, Tom)
            # (John, fatherOf_reverse, Mike)
            # (Tom, fatherOf_reverse, John)
            train_graph[p].setdefault((e1, rel), set()).add(e2)
            train_graph[p].setdefault((e2, rel_reverse), set()).add(e1)
def write_evaluation_graph(cases, graph, path):
    """Write one JSON line per evaluation triple to ``path``.

    Args:
        cases: Iterable of ``(e1, rel, e2)`` triples.
        graph: Mapping ``(entity, relation) -> set of entities``. It must
            contain ``(e1, rel)`` and ``(e2, rel + '_reverse')`` for every
            case, otherwise KeyError is raised.
        path: Output file; it is overwritten.

    Each record holds the triple, the reverse relation name (``rel_eval``)
    and the space-joined targets of both directions (``e2_multi1`` and
    ``e2_multi2``).
    """
    with open(path, 'w') as f:
        for e1, rel, e2 in cases:
            # (Mike, fatherOf) -> John
            # (John, fatherOf, Tom)
            rel_reverse = rel + '_reverse'
            # Sorted so the file is reproducible; the order used to follow
            # set iteration order, which varies between interpreter runs.
            entities1 = " ".join(sorted(graph[(e1, rel)]))
            entities2 = " ".join(sorted(graph[(e2, rel_reverse)]))

            data_point = {
                'e1': e1,
                'e2': e2,
                'rel': rel,
                'rel_eval': rel_reverse,
                'e2_multi1': entities1,
                'e2_multi2': entities2,
            }

            f.write(json.dumps(data_point) + '\n')
write_training_graph(all_cases, label_graph, 'kg_completion/data/{0}/e1rel_to_e2_full.json'.format(dataset_name)) 149 | -------------------------------------------------------------------------------- /KDD2021_demo/math_word_problem_solving/config.yaml: -------------------------------------------------------------------------------- 1 | graph_construction_name: "dependency" 2 | graph_embedding_name: "graphsage" 3 | decoder_name: "stdtree" 4 | 5 | graph_construction_args: 6 | graph_construction_share: 7 | graph_type: 'dependency' 8 | root_dir: "./data" 9 | topology_subdir: 'DependencyGraph' 10 | thread_number: 4 11 | port: 9000 12 | timeout: 15000 13 | 14 | graph_construction_private: 15 | edge_strategy: 'homogeneous' 16 | merge_strategy: 'tailhead' 17 | sequential_link: true 18 | as_node: false 19 | 20 | node_embedding: 21 | input_size: 300 22 | hidden_size: 300 23 | word_dropout: 0.1 24 | rnn_dropout: 0.1 25 | fix_bert_emb: false 26 | fix_word_emb: false 27 | embedding_style: 28 | single_token_item: true 29 | emb_strategy: "w2v_bilstm" 30 | num_rnn_layers: 1 31 | bert_model_name: null 32 | bert_lower_case: null 33 | 34 | sim_metric_type: 'weighted_cosine' 35 | num_heads: 1 36 | top_k_neigh: null 37 | epsilon_neigh: 0.5 38 | smoothness_ratio: 0.1 39 | connectivity_ratio: 0.05 40 | sparsity_ratio: 0.1 41 | 42 | graph_embedding_args: 43 | graph_embedding_share: 44 | num_layers: 1 45 | input_size: 300 46 | hidden_size: 300 47 | output_size: 300 48 | direction_option: "undirected" 49 | feat_drop: 0.0 50 | attn_drop: 0.0 51 | 52 | graph_embedding_private: 53 | aggregator_type: "lstm" 54 | bias: true 55 | norm: null 56 | activation: "relu" 57 | use_edge_weight: false 58 | 59 | decoder_args: 60 | rnn_decoder_share: 61 | rnn_type: "lstm" 62 | input_size: 300 63 | hidden_size: 300 64 | rnn_emb_input_size: 300 65 | use_copy: true 66 | graph_pooling_strategy: null 67 | attention_type: "uniform" 68 | fuse_strategy: "concatenate" 69 | dropout: 0.3 70 | teacher_forcing_rate: 1.0 
def convert_to_string(idx_list, form_manager):
    """Render a sequence of symbol indices as one space-separated string.

    Every element of ``idx_list`` is cast to ``int`` and looked up through
    ``form_manager.get_idx_symbol``; an empty sequence gives ``""``.
    """
    symbols = [form_manager.get_idx_symbol(int(idx)) for idx in idx_list]
    return " ".join(symbols)
def compute_accuracy(candidate_list, reference_list, form_manager):
    """Fraction of candidate/reference pairs that ``is_all_same`` accepts.

    Args:
        candidate_list: Predicted index sequences.
        reference_list: Gold index sequences, aligned by position.
        form_manager: Passed through to ``is_all_same``.

    Returns:
        Matches divided by the number of compared pairs, as a float. Only
        the first ``min(len(candidate_list), len(reference_list))`` pairs
        are compared, and a notice is printed when the lengths differ.
        Returns 0.0 when there is nothing to compare (this used to raise
        ZeroDivisionError).
    """
    n_candidates = len(candidate_list)
    n_references = len(reference_list)
    if n_candidates != n_references:
        print("candidate list has length {}, reference list has length {}\n".format(
            n_candidates, n_references))

    len_min = min(n_candidates, n_references)
    if len_min == 0:
        return 0.0

    correct = sum(
        1
        for candidate, reference in zip(candidate_list, reference_list)
        if is_all_same(candidate, reference, form_manager)
    )
    return correct / float(len_min)
def prepare_oov(batch_graph, src_vocab, device):
    """Build a per-batch vocabulary that also covers unknown node tokens.

    ``src_vocab`` is deep-copied and left untouched. Every node token that
    the copy resolves to the unknown-token index is added to the copy. The
    resulting index of each node token is stored as a ``torch.long`` tensor
    under ``batch_graph.node_features['token_id_oov']`` on ``device``.

    Returns:
        The extended copy of the vocabulary.
    """
    extended_vocab = copy.deepcopy(src_vocab)
    node_token_ids = []
    for attributes in batch_graph.node_attributes:
        token = attributes['token']
        token_idx = extended_vocab.get_symbol_idx(token)
        unk_idx = extended_vocab.get_symbol_idx(extended_vocab.unk_token)
        if token_idx == unk_idx:
            extended_vocab.add_symbol(token)
        # Looked up again on purpose: the index may have changed after add_symbol.
        node_token_ids.append(extended_vocab.get_symbol_idx(token))

    id_tensor = torch.tensor(node_token_ids, dtype=torch.long)
    batch_graph.node_features['token_id_oov'] = id_tensor.to(device)
    return extended_vocab
Install [graph4nlp](https://github.com/graph4ai/graph4nlp) library 23 | - Clone the GitHub repo 24 | ``` 25 | git clone -b [branch_version] https://github.com/graph4ai/graph4nlp.git 26 | cd graph4nlp 27 | ``` 28 | Please choose the branch version corresponding to the demo version as shown in the table below. 29 | 30 | | demo version | library branch version | 31 | | ---- | ---- | 32 | | DLG4NLP@ICLR 2022 | v0.5.5 | 33 | | TheWebConf 2022 | v0.5.5 | 34 | | AAAI 2022 | v0.5.5 | 35 | | CLIQ-ai 2021 | stable_nov2021b | 36 | | IJCAI 2021 | stable_202108 | 37 | | KDD 2021 | stable_202108 | 38 | | SIGIR 2021 | stable | 39 | | NAACL 2021 | stable | 40 | 41 | 42 | - Then run `./configure` (or `./configure.bat` if you are using Windows 10) to configure your installation. The configuration program will ask you to specify your CUDA version. If you do not have a GPU, please choose 'cpu'. 43 | ``` 44 | ./configure 45 | ``` 46 | - Finally, install the package 47 | ``` 48 | python setup.py install 49 | ``` 50 | 3. Install other packages 51 | ``` 52 | pip install torchtext 53 | pip install notebook 54 | ``` 55 | 56 | 4. Set up StanfordCoreNLP (for static graph construction only, unnecessary for this demo because preprocessed data is provided) 57 | - Download [StanfordCoreNLP](https://stanfordnlp.github.io/CoreNLP/) 58 | - Go to the root folder and start the server 59 | ``` 60 | java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000 61 | ``` 62 | 63 | 64 | ### Start Jupyter notebook and run the demo 65 | 66 | After completing the above steps, you can start the Jupyter notebook server to run the demo: 67 | ``` 68 | cd graph4nlp_demo/XYZ 69 | jupyter notebook 70 | ``` 71 | Note that you will need to change `XYZ` to the specific folder name.
72 | 73 | ## Additional Resources: 74 | 75 | * [Graph4NLP library](https://github.com/graph4ai/graph4nlp) 76 | * [DLG4NLP website](https://dlg4nlp.github.io/index.html) 77 | * [DLG4NLP survey](https://arxiv.org/pdf/2106.06090) 78 | * [DLG4NLP literature repo](https://github.com/graph4ai/graph4nlp_literature) 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /TheWebConf2022_demo/Math-word-problem/config.yaml: -------------------------------------------------------------------------------- 1 | graph_construction_name: "node_emb" 2 | graph_embedding_name: "graphsage" 3 | decoder_name: "stdtree" 4 | 5 | graph_construction_args: 6 | graph_construction_share: 7 | graph_name: 'node_emb' 8 | root_dir: "data" 9 | topology_subdir: 'NodeEmbGraph' 10 | thread_number: 4 11 | port: 9000 12 | timeout: 15000 13 | 14 | graph_construction_private: 15 | edge_strategy: 'homogeneous' 16 | merge_strategy: 'tailhead' 17 | sequential_link: true 18 | as_node: false 19 | sim_metric_type: 'weighted_cosine' 20 | num_heads: 1 21 | top_k_neigh: null 22 | epsilon_neigh: 0.5 23 | smoothness_ratio: 0.1 24 | connectivity_ratio: 0.05 25 | sparsity_ratio: 0.1 26 | 27 | graph_initialization_args: 28 | input_size: 300 29 | hidden_size: 300 30 | word_dropout: 0.1 31 | rnn_dropout: 0.1 32 | fix_bert_emb: false 33 | fix_word_emb: false 34 | embedding_style: 35 | single_token_item: true 36 | emb_strategy: "w2v_bilstm" 37 | num_rnn_layers: 1 38 | bert_model_name: null 39 | bert_lower_case: null 40 | 41 | graph_embedding_args: 42 | graph_embedding_share: 43 | num_layers: 1 44 | input_size: 300 45 | hidden_size: 300 46 | output_size: 300 47 | direction_option: "undirected" 48 | feat_drop: 0.0 49 | attn_drop: 0.0 50 | 51 | graph_embedding_private: 52 | aggregator_type: "lstm" 53 | bias: true 54 | norm: null 55 | activation: "relu" 56 | use_edge_weight: true 57 | 58 | decoder_args: 59 | rnn_decoder_share: 60 | rnn_type: "lstm" 61 | input_size: 300 62 | 
hidden_size: 300 63 | rnn_emb_input_size: 300 64 | use_copy: true 65 | graph_pooling_strategy: null 66 | attention_type: "uniform" 67 | fuse_strategy: "concatenate" 68 | dropout: 0.3 69 | teacher_forcing_rate: 1.0 70 | 71 | rnn_decoder_private: 72 | max_decoder_step: 35 73 | max_tree_depth: 8 74 | use_sibling: false 75 | -------------------------------------------------------------------------------- /TheWebConf2022_demo/Math-word-problem/data/processed/NodeEmbGraph/data.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/TheWebConf2022_demo/Math-word-problem/data/processed/NodeEmbGraph/data.pt -------------------------------------------------------------------------------- /TheWebConf2022_demo/Math-word-problem/data/processed/NodeEmbGraph/vocab.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/TheWebConf2022_demo/Math-word-problem/data/processed/NodeEmbGraph/vocab.pt -------------------------------------------------------------------------------- /TheWebConf2022_demo/Math-word-problem/imgs/g2t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/TheWebConf2022_demo/Math-word-problem/imgs/g2t.png -------------------------------------------------------------------------------- /TheWebConf2022_demo/Math-word-problem/utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import sympy 4 | from random import randint 5 | from sympy.parsing.sympy_parser import parse_expr 6 | 7 | def convert_to_string(idx_list, form_manager): 8 | w_list = [] 9 | for i in range(len(idx_list)): 10 | 
def is_solution_same(i1, i2, form_manager):
    """Check whether two predicted equations have the same first solution for x.

    Both index sequences are rendered to text through
    ``form_manager.get_idx_symbol``. The text is split at '=' and the first
    two parts are parsed with sympy and solved for the symbol ``x``.

    Args:
        i1: First sequence of symbol indices.
        i2: Second sequence of symbol indices.
        form_manager: Provides ``get_idx_symbol`` and ``unk_token``.

    Returns:
        True only when both equations yield at least one solution and the
        first solutions are equal. False when either text lacks '=',
        contains the unknown token, has no solution, or cannot be parsed
        or solved.
    """
    c1 = " ".join([form_manager.get_idx_symbol(x) for x in i1])
    c2 = " ".join([form_manager.get_idx_symbol(x) for x in i2])
    if ('=' not in c1) or ('=' not in c2):
        return False
    if (form_manager.unk_token in c1) or (form_manager.unk_token in c2):
        return False

    try:
        s1 = c1.split('=')
        s2 = c2.split('=')
        x = sympy.Symbol('x')
        # NOTE(security): parse_expr evaluates its input with eval(); the
        # strings here are built from vocabulary symbols - do not feed it
        # text from untrusted sources.
        eq1 = sympy.Eq(parse_expr(s1[0]), parse_expr(s1[1]))
        eq2 = sympy.Eq(parse_expr(s2[0]), parse_expr(s2[1]))
        res1 = sympy.solve(eq1, x)
        res2 = sympy.solve(eq2, x)

        if not res1 or not res2:
            return False
        return res1[0] == res2[0]
    except Exception:
        # Any parse/solve failure means "not the same". Exception instead of
        # BaseException so Ctrl-C and SystemExit are no longer swallowed.
        return False
def prepare_ext_vocab(batch_graph, src_vocab, device):
    """Build a per-batch vocabulary extended with unknown node tokens.

    ``src_vocab`` is deep-copied and left untouched. A node token is added
    to the copy when the node's ``"type"`` attribute is missing/None or
    equal to 0 and the copy resolves the token to the unknown-token index.
    The resulting index of every node token is stored as a ``torch.long``
    tensor under ``batch_graph.node_features["token_id_oov"]`` on ``device``.

    Returns:
        The extended copy of the vocabulary.
    """
    ext_vocab = copy.deepcopy(src_vocab)
    node_token_ids = []
    for attributes in batch_graph.node_attributes:
        token = attributes["token"]
        node_type = attributes.get("type")
        type_unset_or_zero = node_type is None or node_type == 0
        # The vocabulary is only queried when the type check passes,
        # mirroring the short-circuit of the original condition.
        if type_unset_or_zero and ext_vocab.get_symbol_idx(token) == ext_vocab.get_symbol_idx(
            ext_vocab.unk_token
        ):
            ext_vocab.add_symbol(token)
        node_token_ids.append(ext_vocab.get_symbol_idx(token))

    id_tensor = torch.tensor(node_token_ids, dtype=torch.long)
    batch_graph.node_features["token_id_oov"] = id_tensor.to(device)
    return ext_vocab
| epsilon_neigh: 0.5 38 | smoothness_ratio: 0.1 39 | connectivity_ratio: 0.05 40 | sparsity_ratio: 0.1 41 | 42 | graph_embedding_args: 43 | graph_embedding_share: 44 | num_layers: 3 45 | input_size: 300 46 | hidden_size: 300 47 | output_size: 300 48 | direction_option: "bi_sep" 49 | feat_drop: 0.2 50 | 51 | graph_embedding_private: 52 | heads: [2, 2, 1] 53 | attn_drop: 0.2 54 | negative_slope: 0.2 55 | residual: false 56 | activation: "relu" 57 | 58 | decoder_args: 59 | rnn_decoder_share: 60 | rnn_type: "lstm" 61 | input_size: 300 62 | hidden_size: 512 63 | rnn_emb_input_size: 300 64 | use_copy: true 65 | use_coverage: true 66 | graph_pooling_strategy: null 67 | attention_type: "sep_diff_encoder_type" 68 | fuse_strategy: "concatenate" 69 | dropout: 0.3 70 | 71 | rnn_decoder_private: 72 | max_decoder_step: 50 73 | node_type_num: null 74 | tgt_emb_as_output_layer: true 75 | 76 | other_args: 77 | seed: 1236 78 | checkpoint_save_path: 'out/jobs/gat_bi_sep_node_emb_ckpt' 79 | pretrained_word_emb_name: '6B' 80 | pretrained_word_emb_cache_dir: '.vector_cache' 81 | share_vocab: True 82 | word_emb_size: 300 83 | beam_size: 4 84 | epochs: 200 # number of maximal training epochs 85 | batch_size: 24 86 | learning_rate: 1.e-3 87 | loss_display_step: 10 88 | eval_display_number: 10 89 | lr_start_decay_epoch: 20 90 | lr_decay_rate: 0.9 91 | lr_decay_per_epoch: 5 92 | min_lr: 1.e-3 93 | val_split_ratio: 0 94 | num_workers: 0 # number of data loader workers 95 | use_gpu: 1 # 0 for don't use cuda, 1 for using cuda 96 | gpu: 0 # gpu id 97 | -------------------------------------------------------------------------------- /config/jobs/gat_bi_sep_dynamic_node_emb_v2.yaml: -------------------------------------------------------------------------------- 1 | graph_construction_name: "node_emb" 2 | graph_embedding_name: "gat" 3 | decoder_name: "stdrnn" 4 | 5 | graph_construction_args: 6 | graph_construction_share: 7 | graph_name: 'node_emb' 8 | root_dir: "../data/jobs" 9 | topology_subdir: 
'NodeEmbGraph' 10 | thread_number: 4 11 | port: 9000 12 | timeout: 15000 13 | 14 | graph_construction_private: 15 | edge_strategy: 'homogeneous' 16 | merge_strategy: 'tailhead' 17 | sequential_link: true 18 | as_node: false 19 | sim_metric_type: 'weighted_cosine' 20 | num_heads: 1 21 | top_k_neigh: null 22 | epsilon_neigh: 0.5 23 | smoothness_ratio: 0.1 24 | connectivity_ratio: 0.05 25 | sparsity_ratio: 0.1 26 | 27 | graph_initialization_args: 28 | input_size: 300 29 | hidden_size: 300 30 | word_dropout: 0.2 31 | rnn_dropout: 0.3 32 | fix_bert_emb: false 33 | fix_word_emb: false 34 | embedding_style: 35 | single_token_item: true 36 | emb_strategy: "w2v_bilstm" 37 | num_rnn_layers: 1 38 | bert_model_name: null 39 | bert_lower_case: null 40 | 41 | 42 | 43 | graph_embedding_args: 44 | graph_embedding_share: 45 | num_layers: 3 46 | input_size: 300 47 | hidden_size: 300 48 | output_size: 300 49 | direction_option: "bi_sep" 50 | feat_drop: 0.2 51 | 52 | graph_embedding_private: 53 | heads: [2, 2, 1] 54 | attn_drop: 0.2 55 | negative_slope: 0.2 56 | residual: true 57 | activation: "relu" 58 | allow_zero_in_degree: true 59 | 60 | decoder_args: 61 | rnn_decoder_share: 62 | rnn_type: "lstm" 63 | input_size: 300 64 | hidden_size: 512 65 | rnn_emb_input_size: 300 66 | use_copy: true 67 | use_coverage: true 68 | graph_pooling_strategy: null 69 | attention_type: "sep_diff_encoder_type" 70 | fuse_strategy: "concatenate" 71 | dropout: 0.3 72 | 73 | rnn_decoder_private: 74 | max_decoder_step: 50 75 | node_type_num: null 76 | tgt_emb_as_output_layer: true 77 | 78 | other_args: 79 | seed: 1236 80 | checkpoint_save_path: 'out/jobs/gat_bi_sep_node_emb_ckpt' 81 | pretrained_word_emb_name: '6B' 82 | pretrained_word_emb_cache_dir: '.vector_cache' 83 | share_vocab: True 84 | word_emb_size: 300 85 | beam_size: 4 86 | epochs: 200 # number of maximal training epochs 87 | batch_size: 24 88 | learning_rate: 1.e-3 89 | loss_display_step: 10 90 | eval_display_number: 10 91 | lr_start_decay_epoch: 
20 92 | lr_decay_rate: 0.9 93 | lr_decay_per_epoch: 5 94 | min_lr: 1.e-3 95 | val_split_ratio: 0 96 | num_workers: 0 # number of data loader workers 97 | use_gpu: 1 # 0 for don't use cuda, 1 for using cuda 98 | gpu: 0 # gpu id 99 | -------------------------------------------------------------------------------- /config/trec/graphsage_bi_fuse_static_dependency.yaml: -------------------------------------------------------------------------------- 1 | # Data 2 | dataset: 'trec' 3 | root_data_dir: '../data/trec' 4 | val_split_ratio: 0.2 # validation set split ratio (default: 0.2) 5 | pretrained_word_emb_name: '840B' 6 | out_dir: 'out/trec/graphsage_bi_fuse_dependency_ckpt' 7 | 8 | 9 | # Graph construction 10 | graph_type: 'dependency' # graph construction type ('dependency', 'constituency', 'ie', 'node_emb', 'node_emb_refined') 11 | 12 | # Dynamic graph construction 13 | init_graph_type: null # initial graph construction type ('line', 'dependency', 'constituency', 'ie') 14 | gl_metric_type: null # similarity metric type for dynamic graph construction ('weighted_cosine', 'attention', 'rbf_kernel', 'cosine') 15 | gl_epsilon: null # epsilon for graph sparsification 16 | gl_top_k: null # top k for graph sparsification 17 | gl_num_heads: 1 # num of heads for dynamic graph construction 18 | gl_num_hidden: 300 # number of hidden units for dynamic graph construction 19 | gl_smoothness_ratio: null # smoothness ratio for graph regularization loss 20 | gl_sparsity_ratio: null # sparsity ratio for graph regularization loss 21 | gl_connectivity_ratio: null # connectivity ratio for graph regularization loss 22 | init_adj_alpha: null # alpha ratio for combining initial graph adjacency matrix 23 | 24 | 25 | # Graph embedding construction 26 | word_dropout: 0.4 # word embedding dropout 27 | rnn_dropout: 0.1 # RNN dropout 28 | no_fix_word_emb: false # Not fix pretrained word embeddings (default: false) 29 | node_edge_emb_strategy: 'mean' # node edge embedding strategy for graph 
embedding construction ('mean', 'lstm', 'gru', 'bilstm' and 'bigru') 30 | seq_info_encode_strategy: 'bilstm' # sequence info encoding strategy for graph embedding construction ('none', 'lstm', 'gru', 'bilstm' and 'bigru') 31 | 32 | 33 | # GNN 34 | gnn: 'graphsage' 35 | gnn_direction_option: 'bi_fuse' # GNN direction type ('undirected', 'bi_sep', 'bi_fuse') 36 | gnn_num_layers: 1 # number of GNN layers 37 | num_hidden: 300 # number of hidden units 38 | graph_pooling: 'avg_pool' # graph pooling ('avg_pool', 'max_pool') 39 | max_pool_linear_proj: false # use linear projection for max pooling 40 | gnn_dropout: 0.3 # 0.3 # GNN input feature dropout 41 | 42 | # GAT 43 | gat_attn_dropout: null # GAT attention dropout 44 | gat_negative_slope: null # the negative slope of leaky relu 45 | gat_num_heads: null # number of hidden attention heads 46 | gat_num_out_heads: null # number of output attention heads 47 | gat_residual: false # use gat_residual connection 48 | # GraphSAGE 49 | graphsage_aggreagte_type: 'lstm' # GraphSAGE aggregate type ('mean', 'gcn', 'pool', 'lstm') 50 | 51 | 52 | # Training 53 | seed: 1234 54 | batch_size: 50 # batch size 55 | epochs: 500 # maximum number of training epochs 56 | patience: 10 57 | lr: 0.001 # learning rate 58 | lr_patience: 2 59 | lr_reduce_factor: 0.5 60 | num_workers: 1 # number of data loader workers 61 | 62 | 63 | gpu: 0 64 | no_cuda: false 65 | -------------------------------------------------------------------------------- /config/trec/graphsage_bi_fuse_static_dependency_v2.yaml: -------------------------------------------------------------------------------- 1 | # Data 2 | dataset: 'trec' 3 | val_split_ratio: 0.2 # validation set split ratio (default: 0.2) 4 | pretrained_word_emb_name: '840B' 5 | out_dir: 'out/trec/graphsage_bi_fuse_dependency_ckpt' 6 | 7 | 8 | # Graph construction 9 | graph_construction_args: 10 | graph_construction_share: 11 | graph_name: 'dependency' 12 | root_dir: '../data/trec' 13 | thread_number: 10 14 |
port: 9000 15 | timeout: 15000 16 | 17 | graph_construction_private: 18 | edge_strategy: 'homogeneous' 19 | merge_strategy: 'tailhead' 20 | sequential_link: true 21 | as_node: false 22 | dynamic_init_graph_name: null # initial graph construction type ('line', 'dependency', 'constituency', 'ie') 23 | 24 | 25 | 26 | # Dynamic graph construction 27 | gl_metric_type: null # similarity metric type for dynamic graph construction ('weighted_cosine', 'attention', 'rbf_kernel', 'cosine') 28 | gl_epsilon: null # epsilon for graph sparsification 29 | gl_top_k: null # top k for graph sparsification 30 | gl_num_heads: 1 # num of heads for dynamic graph construction 31 | gl_num_hidden: 300 # number of hidden units for dynamic graph construction 32 | gl_smoothness_ratio: null # smoothness ratio for graph regularization loss 33 | gl_sparsity_ratio: null # sparsity ratio for graph regularization loss 34 | gl_connectivity_ratio: null # connectivity ratio for graph regularization loss 35 | init_adj_alpha: null # alpha ratio for combining initial graph adjacency matrix 36 | 37 | 38 | # Graph embedding construction 39 | word_dropout: 0.4 # word embedding dropout 40 | rnn_dropout: 0.1 # RNN dropout 41 | no_fix_word_emb: false # Not fix pretrained word embeddings (default: false) 42 | emb_strategy: 'w2v_bilstm' 43 | 44 | # GNN 45 | gnn: 'graphsage' 46 | gnn_direction_option: 'bi_fuse' # GNN direction type ('undirected', 'bi_sep', 'bi_fuse') 47 | gnn_num_layers: 1 # number of GNN layers 48 | num_hidden: 300 # number of hidden units 49 | graph_pooling: 'avg_pool' # graph pooling ('avg_pool', 'max_pool') 50 | max_pool_linear_proj: false # use linear projection for max pooling 51 | gnn_dropout: 0.4 # 0.4 # GNN input feature dropout 52 | 53 | # GAT 54 | gat_attn_dropout: null # GAT attention dropout 55 | gat_negative_slope: null # the negative slope of leaky relu 56 | gat_num_heads: null # number of hidden attention heads 57 | gat_num_out_heads: null # number of output attention heads 58 | 
gat_residual: false # use gat_residual connection 59 | # GraphSAGE 60 | graphsage_aggreagte_type: 'lstm' # graphsage aggregate type ('mean', 'gcn', 'pool', 'lstm') 61 | 62 | 63 | # Training 64 | seed: 1234 65 | batch_size: 50 # batch size 66 | epochs: 500 # maximum number of training epochs 67 | patience: 10 68 | lr: 0.001 # learning rate 69 | lr_patience: 2 70 | lr_reduce_factor: 0.5 71 | num_workers: 0 # number of data loader workers 72 | 73 | 74 | gpu: -1 75 | no_cuda: false 76 | -------------------------------------------------------------------------------- /data/jobs/processed/NodeEmbGraph/data.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/jobs/processed/NodeEmbGraph/data.pt -------------------------------------------------------------------------------- /data/jobs/processed/NodeEmbGraph/vocab.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/jobs/processed/NodeEmbGraph/vocab.pt -------------------------------------------------------------------------------- /data/jobs/processed/node_emb_graph/data.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/jobs/processed/node_emb_graph/data.pt -------------------------------------------------------------------------------- /data/jobs/processed/node_emb_graph/vocab.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/jobs/processed/node_emb_graph/vocab.pt -------------------------------------------------------------------------------- /data/jobs/raw/sequence.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/jobs/raw/sequence.pt -------------------------------------------------------------------------------- /data/jobs/raw/vocab.f.txt: -------------------------------------------------------------------------------- 1 | , 1996 2 | ) 1498 3 | ( 1498 4 | ANS 1470 5 | job 494 6 | language 209 7 | languageid0 195 8 | loc 167 9 | locid0 164 10 | req_deg 111 11 | degid0 105 12 | platform 102 13 | platformid0 97 14 | area 87 15 | areaid0 84 16 | titleid0 79 17 | title 79 18 | salary_greater_than 66 19 | num_salary 66 20 | year 65 21 | req_exp 63 22 | \+ 62 23 | company 58 24 | companyid0 56 25 | year0 31 26 | application 22 27 | languageid1 19 28 | A 19 29 | applicationid0 14 30 | des_deg 13 31 | des_exp 11 32 | recruiter 6 33 | B 6 34 | recruiterid0 6 35 | degid1 4 36 | ; 3 37 | year1 3 38 | X 3 39 | areaid1 3 40 | languageid2 2 41 | platformid1 2 42 | P 2 43 | 20 1 44 | month 1 45 | salary_less_than 1 46 | hour 1 47 | locid1 1 48 | -------------------------------------------------------------------------------- /data/jobs/raw/vocab.q.txt: -------------------------------------------------------------------------------- 1 | job 452 2 | in 200 3 | languageid0 195 4 | what 188 5 | are 186 6 | a 166 7 | locid0 164 8 | requir 163 9 | there 142 10 | me 135 11 | show 132 12 | the 130 13 | degid0 105 14 | use 100 15 | platformid0 98 16 | for 86 17 | areaid0 84 18 | titleid0 79 19 | that 74 20 | experi 72 21 | on 70 22 | ani 68 23 | num_salary 66 24 | with 65 25 | of 63 26 | and 63 27 | companyid0 59 28 | year 53 29 | not 32 30 | year0 31 31 | list 31 32 | pay 31 33 | at 29 34 | give 26 35 | degre 25 36 | desir 22 37 | but 22 38 | which 22 39 | specialist 21 40 | do 20 41 | languageid1 19 42 | salari 19 43 | avail 17 44 | no 17 45 | all 16 46 | knowledg 16 47 | posit 16 48 | have 15 49 | i 15 50 | 
applicationid0 14 51 | per 14 52 | tell 13 53 | offer 12 54 | than 12 55 | to 11 56 | open 11 57 | is 11 58 | least 10 59 | work 10 60 | program 9 61 | know 9 62 | want 9 63 | who 9 64 | find 8 65 | dont 8 66 | need 8 67 | area 7 68 | greater 6 69 | recruiterid0 6 70 | can 6 71 | as 6 72 | you 6 73 | doe 6 74 | over 5 75 | involv 5 76 | locat 5 77 | specialti 5 78 | like 5 79 | more 5 80 | compani 5 81 | recruit 5 82 | an 5 83 | dollar 4 84 | out 4 85 | could 4 86 | comput 4 87 | degid1 4 88 | onli 3 89 | year1 3 90 | anyth 3 91 | deal 3 92 | some 3 93 | areaid1 3 94 | or 3 95 | platform 3 96 | would 3 97 | peopl 3 98 | hire 3 99 | develop 3 100 | titl 3 101 | everyth 3 102 | move 2 103 | administr 2 104 | see 2 105 | special 2 106 | machin 2 107 | colleg 2 108 | major 2 109 | within 2 110 | art 2 111 | languageid2 2 112 | from 2 113 | system 2 114 | their 2 115 | tool 2 116 | wish 2 117 | employ 2 118 | were 2 119 | applic 2 120 | platformid1 2 121 | concern 2 122 | sure 2 123 | student 2 124 | fresh 2 125 | buzword 1 126 | help 1 127 | month 1 128 | languag 1 129 | field 1 130 | less 1 131 | might 1 132 | familiar 1 133 | vaniti 1 134 | earn 1 135 | name 1 136 | where 1 137 | pleas 1 138 | old 1 139 | locid1 1 140 | someth 1 141 | hardwar 1 142 | learn 1 143 | key 1 144 | 20 1 145 | outsid 1 146 | oper 1 147 | wonder 1 148 | doesnt 1 149 | cs 1 150 | . 
1 151 | live 1 152 | much 1 153 | relat 1 154 | hold 1 155 | look 1 156 | hour 1 157 | us 1 158 | near 1 159 | it 1 160 | if 1 161 | make 1 162 | id 1 163 | satiat 1 164 | how 1 165 | anyon 1 166 | someon 1 167 | associ 1 168 | without 1 169 | environ 1 170 | greed 1 171 | -------------------------------------------------------------------------------- /data/trec/processed/dependency_graph/data.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/trec/processed/dependency_graph/data.pt -------------------------------------------------------------------------------- /data/trec/processed/dependency_graph/label.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/trec/processed/dependency_graph/label.pt -------------------------------------------------------------------------------- /data/trec/processed/dependency_graph/vocab.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/trec/processed/dependency_graph/vocab.pt --------------------------------------------------------------------------------