├── .gitignore
├── AAAI2022_demo
    ├── semantic_parsing.ipynb
    └── text_classification.ipynb
├── CLIQ-ai2021_demo
    ├── semantic_parsing.ipynb
    └── text_classification.ipynb
├── DLG4NLP@ICLR2022_demo
    ├── Math-word-problem
    │   ├── config.yaml
    │   ├── data
    │   │   └── raw
    │   │   │   ├── test.txt
    │   │   │   ├── train.txt
    │   │   │   └── valid.txt
    │   ├── imgs
    │   │   └── g2t.png
    │   ├── math_word_problem.ipynb
    │   └── utils.py
    └── text_classification.ipynb
├── GraphML2022_demo
    ├── Math-word-problem
    │   ├── .ipynb_checkpoints
    │   │   └── math_word_problem-checkpoint.ipynb
    │   ├── config.yaml
    │   ├── data
    │   │   └── raw
    │   │   │   ├── test.txt
    │   │   │   ├── train.txt
    │   │   │   └── valid.txt
    │   ├── imgs
    │   │   └── g2t.png
    │   ├── math_word_problem.ipynb
    │   └── utils.py
    └── text_classification.ipynb
├── IJCAI2021_demo
    ├── kg_completion
    │   ├── .ipynb_checkpoints
    │   │   └── kgc-checkpoint.ipynb
    │   ├── __init__.py
    │   ├── data
    │   │   └── kinship
    │   │   │   ├── e1rel_to_e2_full.json
    │   │   │   ├── e1rel_to_e2_ranking_dev.json
    │   │   │   ├── e1rel_to_e2_ranking_test.json
    │   │   │   ├── e1rel_to_e2_train.json
    │   │   │   ├── test.txt
    │   │   │   ├── train.txt
    │   │   │   └── valid.txt
    │   ├── evaluation.py
    │   ├── kgc.ipynb
    │   ├── kinship
    │   │   ├── processed
    │   │   │   └── KG_graph.pt
    │   │   └── raw
    │   │   │   └── kinship.tar.gz
    │   ├── model.py
    │   ├── preprocess.sh
    │   ├── spodernet
    │   │   ├── __init__.py
    │   │   ├── backends
    │   │   │   ├── __init__.py
    │   │   │   ├── tfbackend.py
    │   │   │   ├── tfmodels.py
    │   │   │   ├── torchbackend.py
    │   │   │   └── torchmodels.py
    │   │   ├── data
    │   │   │   ├── __init__.py
    │   │   │   └── snli2spoder.py
    │   │   ├── frontend.py
    │   │   ├── hooks.py
    │   │   ├── interfaces.py
    │   │   ├── preprocessing
    │   │   │   ├── __init__.py
    │   │   │   ├── batching.py
    │   │   │   ├── pipeline.py
    │   │   │   ├── processors.py
    │   │   │   └── vocab.py
    │   │   └── utils
    │   │   │   ├── __init__.py
    │   │   │   ├── cuda_utils.py
    │   │   │   ├── global_config.py
    │   │   │   ├── logger.py
    │   │   │   ├── spacy_util.py
    │   │   │   └── util.py
    │   └── wrangle_KG.py
    ├── math_word_problem_solving
    │   ├── .ipynb_checkpoints
    │   │   └── math_word_problem-checkpoint.ipynb
    │   ├── config.yaml
    │   ├── data
    │   │   ├── processed
    │   │   │   └── DependencyGraph
    │   │   │   │   ├── data.pt
    │   │   │   │   └── vocab.pt
    │   │   └── raw
    │   │   │   ├── test.txt
    │   │   │   ├── train.txt
    │   │   │   └── valid.txt
    │   ├── imgs
    │   │   └── g2t.png
    │   ├── math_word_problem.ipynb
    │   └── utils.py
    ├── semantic_parsing.ipynb
    └── text_classification.ipynb
├── KDD2021_demo
    ├── kg_completion
    │   ├── .ipynb_checkpoints
    │   │   └── kgc-checkpoint.ipynb
    │   ├── __init__.py
    │   ├── data
    │   │   └── kinship
    │   │   │   ├── e1rel_to_e2_full.json
    │   │   │   ├── e1rel_to_e2_ranking_dev.json
    │   │   │   ├── e1rel_to_e2_ranking_test.json
    │   │   │   ├── e1rel_to_e2_train.json
    │   │   │   ├── test.txt
    │   │   │   ├── train.txt
    │   │   │   └── valid.txt
    │   ├── evaluation.py
    │   ├── kgc.ipynb
    │   ├── kinship
    │   │   ├── processed
    │   │   │   └── KG_graph.pt
    │   │   └── raw
    │   │   │   └── kinship.tar.gz
    │   ├── model.py
    │   ├── preprocess.sh
    │   ├── saved_models
    │   │   └── kinship_ggnn_distmult_0.2_0.25.model
    │   ├── spodernet
    │   │   ├── __init__.py
    │   │   ├── backends
    │   │   │   ├── __init__.py
    │   │   │   ├── tfbackend.py
    │   │   │   ├── tfmodels.py
    │   │   │   ├── torchbackend.py
    │   │   │   └── torchmodels.py
    │   │   ├── data
    │   │   │   ├── __init__.py
    │   │   │   └── snli2spoder.py
    │   │   ├── frontend.py
    │   │   ├── hooks.py
    │   │   ├── interfaces.py
    │   │   ├── preprocessing
    │   │   │   ├── __init__.py
    │   │   │   ├── batching.py
    │   │   │   ├── pipeline.py
    │   │   │   ├── processors.py
    │   │   │   └── vocab.py
    │   │   └── utils
    │   │   │   ├── __init__.py
    │   │   │   ├── cuda_utils.py
    │   │   │   ├── global_config.py
    │   │   │   ├── logger.py
    │   │   │   ├── spacy_util.py
    │   │   │   └── util.py
    │   └── wrangle_KG.py
    ├── math_word_problem_solving
    │   ├── .ipynb_checkpoints
    │   │   └── math_word_problem-checkpoint.ipynb
    │   ├── config.yaml
    │   ├── data
    │   │   ├── processed
    │   │   │   └── DependencyGraph
    │   │   │   │   ├── data.pt
    │   │   │   │   └── vocab.pt
    │   │   └── raw
    │   │   │   ├── test.txt
    │   │   │   ├── train.txt
    │   │   │   └── valid.txt
    │   ├── imgs
    │   │   └── g2t.png
    │   ├── math_word_problem.ipynb
    │   └── utils.py
    ├── semantic_parsing.ipynb
    └── text_classification.ipynb
├── LICENSE
├── NAACL2021_demo
    ├── semantic_parsing.ipynb
    └── text_classification.ipynb
├── README.md
├── SIGIR2021_demo
    ├── semantic_parsing.ipynb
    └── text_classification.ipynb
├── TheWebConf2022_demo
    ├── Math-word-problem
    │   ├── config.yaml
    │   ├── data
    │   │   ├── processed
    │   │   │   └── NodeEmbGraph
    │   │   │   │   ├── data.pt
    │   │   │   │   └── vocab.pt
    │   │   └── raw
    │   │   │   ├── test.txt
    │   │   │   ├── train.txt
    │   │   │   └── valid.txt
    │   ├── imgs
    │   │   └── g2t.png
    │   ├── math_word_problem.ipynb
    │   └── utils.py
    └── text_classification.ipynb
├── config
    ├── jobs
    │   ├── gat_bi_sep_dynamic_node_emb.yaml
    │   └── gat_bi_sep_dynamic_node_emb_v2.yaml
    └── trec
    │   ├── graphsage_bi_fuse_static_dependency.yaml
    │   └── graphsage_bi_fuse_static_dependency_v2.yaml
└── data
    ├── jobs
        ├── processed
        │   ├── NodeEmbGraph
        │   │   ├── data.pt
        │   │   └── vocab.pt
        │   └── node_emb_graph
        │   │   ├── data.pt
        │   │   └── vocab.pt
        └── raw
        │   ├── sequence.pt
        │   ├── sequence.txt
        │   ├── test.txt
        │   ├── train.txt
        │   ├── vocab.f.txt
        │   └── vocab.q.txt
    └── trec
        ├── processed
            └── dependency_graph
            │   ├── data.pt
            │   ├── label.pt
            │   └── vocab.pt
        └── raw
            ├── test.txt
            └── train.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | */__pycache__
3 | */.ipynb_checkpoints
4 | */out
5 | */.vector_cache
6 | *.DS_Store
7 | */*/out
8 | */*/.vector_cache/
9 | *.pt


--------------------------------------------------------------------------------
/DLG4NLP@ICLR2022_demo/Math-word-problem/config.yaml:
--------------------------------------------------------------------------------
 1 | graph_construction_name: "node_emb"
 2 | graph_embedding_name: "graphsage"
 3 | decoder_name: "stdtree"
 4 | 
 5 | graph_construction_args:
 6 |   graph_construction_share:
 7 |     graph_name: 'node_emb'
 8 |     root_dir: "data"
 9 |     topology_subdir: 'NodeEmbGraph'
10 |     thread_number: 4
11 |     port: 9000
12 |     timeout: 15000
13 | 
14 |   graph_construction_private:
15 |     edge_strategy: 'homogeneous'
16 |     merge_strategy: 'tailhead'
17 |     sequential_link: true
18 |     as_node: false
19 |     sim_metric_type: 'weighted_cosine'
20 |     num_heads: 1
21 |     top_k_neigh: null
22 |     epsilon_neigh: 0.5
23 |     smoothness_ratio: 0.1
24 |     connectivity_ratio: 0.05
25 |     sparsity_ratio: 0.1
26 | 
27 | graph_initialization_args:
28 |   input_size: 300
29 |   hidden_size: 300
30 |   word_dropout: 0.1
31 |   rnn_dropout: 0.1
32 |   fix_bert_emb: false
33 |   fix_word_emb: false
34 |   embedding_style:
35 |     single_token_item: true
36 |     emb_strategy: "w2v_bilstm"
37 |     num_rnn_layers: 1
38 |     bert_model_name: null
39 |     bert_lower_case: null
40 | 
41 | graph_embedding_args:
42 |   graph_embedding_share:
43 |     num_layers: 1
44 |     input_size: 300
45 |     hidden_size: 300
46 |     output_size: 300
47 |     direction_option: "undirected"
48 |     feat_drop: 0.0
49 |     attn_drop: 0.0
50 | 
51 |   graph_embedding_private:
52 |     aggregator_type: "lstm"
53 |     bias: true
54 |     norm: null
55 |     activation: "relu"
56 |     use_edge_weight: true
57 | 
58 | decoder_args:
59 |   rnn_decoder_share:
60 |     rnn_type: "lstm"
61 |     input_size: 300
62 |     hidden_size: 300
63 |     rnn_emb_input_size: 300
64 |     use_copy: true
65 |     graph_pooling_strategy: null
66 |     attention_type: "uniform"
67 |     fuse_strategy: "concatenate"
68 |     dropout: 0.3
69 |     teacher_forcing_rate: 1.0
70 | 
71 |   rnn_decoder_private:
72 |     max_decoder_step: 35
73 |     max_tree_depth: 8
74 |     use_sibling: false
75 | 


--------------------------------------------------------------------------------
/DLG4NLP@ICLR2022_demo/Math-word-problem/imgs/g2t.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/DLG4NLP@ICLR2022_demo/Math-word-problem/imgs/g2t.png


--------------------------------------------------------------------------------
/DLG4NLP@ICLR2022_demo/Math-word-problem/utils.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | import torch
 3 | import sympy
 4 | from random import randint
 5 | from sympy.parsing.sympy_parser import parse_expr
 6 | 
 7 | def convert_to_string(idx_list, form_manager):
 8 |     w_list = []
 9 |     for i in range(len(idx_list)):
10 |         w_list.append(form_manager.get_idx_symbol(int(idx_list[i])))
11 |     return " ".join(w_list)
12 | 
def is_all_same(c1, c2, form_manager):
    """Decide whether a candidate and a reference index sequence match.

    The two sequences match when they have the same length and are equal
    element by element, or, failing that, when ``is_solution_same`` reports
    them as equivalent.

    Args:
        c1: First indexable sequence of vocabulary indices.
        c2: Second indexable sequence of vocabulary indices.
        form_manager: Forwarded unchanged to ``is_solution_same``.

    Returns:
        ``True`` when the sequences match, ``False`` otherwise.
    """
    identical = len(c1) == len(c2) and not any(
        c1[pos] != c2[pos] for pos in range(len(c1))
    )
    if identical:
        return True
    # Token-level mismatch: fall back to comparing the solved equations.
    return True if is_solution_same(c1, c2, form_manager) else False
27 | 
28 | 
def is_solution_same(i1, i2, form_manager):
    """Check whether two equations, given as index sequences, solve to the same ``x``.

    Each sequence is turned into text through ``form_manager.get_idx_symbol``,
    split on ``'='``, parsed with sympy and solved for the symbol ``x``. Only
    the first solution of each equation is compared.

    Args:
        i1: Iterable of vocabulary indices for the first equation.
        i2: Iterable of vocabulary indices for the second equation.
        form_manager: Object exposing ``get_idx_symbol`` and ``unk_token``.

    Returns:
        ``False`` when either text lacks ``'='``, contains the unknown token,
        cannot be parsed/solved, or has no solution; otherwise the result of
        comparing the first solutions.
    """
    c1 = " ".join(form_manager.get_idx_symbol(idx) for idx in i1)
    c2 = " ".join(form_manager.get_idx_symbol(idx) for idx in i2)

    if ('=' not in c1) or ('=' not in c2):
        return False
    if (form_manager.unk_token in c1) or (form_manager.unk_token in c2):
        return False

    try:
        s1 = c1.split('=')
        s2 = c2.split('=')
        x = sympy.Symbol('x')
        # Only the text before the first '=' and between the first and second
        # '=' is used, exactly as before.
        lhs1, rhs1 = parse_expr(s1[0]), parse_expr(s1[1])
        lhs2, rhs2 = parse_expr(s2[0]), parse_expr(s2[1])
        res1 = sympy.solve(sympy.Eq(lhs1, rhs1), x)
        res2 = sympy.solve(sympy.Eq(lhs2, rhs2), x)

        if not res1 or not res2:
            return False
        return res1[0] == res2[0]
    except Exception:
        # Malformed or unsolvable predictions count as a mismatch. Catching
        # Exception (not BaseException) lets KeyboardInterrupt/SystemExit
        # propagate so a long evaluation can still be interrupted.
        return False
61 | 
def compute_accuracy(candidate_list, reference_list, form_manager):
    """Compute the fraction of candidates that match their references.

    Pairs are compared position by position with ``is_all_same``. When the
    two lists differ in length a notice is printed and only the first
    ``min(len(candidate_list), len(reference_list))`` pairs are scored.

    Args:
        candidate_list: Sized, indexable collection of predicted sequences.
        reference_list: Sized, indexable collection of reference sequences.
        form_manager: Forwarded unchanged to ``is_all_same``.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when there is nothing to
        compare (previously this case raised ``ZeroDivisionError``).
    """
    if len(candidate_list) != len(reference_list):
        print("candidate list has length {}, reference list has length {}\n".format(
            len(candidate_list), len(reference_list)))
    len_min = min(len(candidate_list), len(reference_list))
    if len_min == 0:
        # No pairs to score; avoid dividing by zero.
        return 0.0
    c = sum(
        1
        for i in range(len_min)
        if is_all_same(candidate_list[i], reference_list[i], form_manager)
    )
    return c / float(len_min)
74 | 
75 | 
def compute_tree_accuracy(candidate_list_, reference_list_, form_manager):
    """Copy both inputs into plain lists and score them with ``compute_accuracy``.

    Args:
        candidate_list_: Sized, indexable collection of predicted sequences.
        reference_list_: Sized, indexable collection of reference sequences.
        form_manager: Forwarded unchanged to ``compute_accuracy``.

    Returns:
        Whatever ``compute_accuracy`` returns for the copied lists.
    """
    candidates = [candidate_list_[k] for k in range(len(candidate_list_))]
    references = [reference_list_[k] for k in range(len(reference_list_))]
    return compute_accuracy(candidates, references, form_manager)
84 | 
def prepare_ext_vocab(batch_graph, src_vocab, device):
    """Build an extended copy of ``src_vocab`` for one batch and tag nodes with its ids.

    A deep copy of ``src_vocab`` is extended with every node token that
    currently maps to the unknown-token index, provided the node's ``"type"``
    attribute is missing or ``0``. The resulting id of every node token is
    stored in ``batch_graph.node_features["token_id_oov"]``.

    Args:
        batch_graph: Object with iterable ``node_attributes`` (dict-like items
            holding ``"token"`` and optionally ``"type"``) and a writable
            ``node_features`` mapping.
        src_vocab: Vocabulary exposing ``get_symbol_idx``, ``add_symbol`` and
            ``unk_token``; it is not modified.
        device: Target passed to ``Tensor.to`` for the id tensor.

    Returns:
        The extended vocabulary copy.
    """
    oov_dict = copy.deepcopy(src_vocab)
    token_ids = []
    for attrs in batch_graph.node_attributes:
        token = attrs["token"]
        node_type = attrs.get("type")
        if node_type is None or node_type == 0:
            maps_to_unk = oov_dict.get_symbol_idx(token) == oov_dict.get_symbol_idx(
                oov_dict.unk_token
            )
            if maps_to_unk:
                oov_dict.add_symbol(token)
        token_ids.append(oov_dict.get_symbol_idx(token))

    id_tensor = torch.tensor(token_ids, dtype=torch.long)
    batch_graph.node_features["token_id_oov"] = id_tensor.to(device)
    return oov_dict


--------------------------------------------------------------------------------
/GraphML2022_demo/Math-word-problem/config.yaml:
--------------------------------------------------------------------------------
 1 | graph_construction_name: "node_emb"
 2 | graph_embedding_name: "graphsage"
 3 | decoder_name: "stdtree"
 4 | 
 5 | graph_construction_args:
 6 |   graph_construction_share:
 7 |     graph_name: 'node_emb'
 8 |     root_dir: "data"
 9 |     topology_subdir: 'NodeEmbGraph'
10 |     thread_number: 4
11 |     port: 9000
12 |     timeout: 15000
13 | 
14 |   graph_construction_private:
15 |     edge_strategy: 'homogeneous'
16 |     merge_strategy: 'tailhead'
17 |     sequential_link: true
18 |     as_node: false
19 |     sim_metric_type: 'weighted_cosine'
20 |     num_heads: 1
21 |     top_k_neigh: null
22 |     epsilon_neigh: 0.5
23 |     smoothness_ratio: 0.1
24 |     connectivity_ratio: 0.05
25 |     sparsity_ratio: 0.1
26 | 
27 | graph_initialization_args:
28 |   input_size: 300
29 |   hidden_size: 300
30 |   word_dropout: 0.1
31 |   rnn_dropout: 0.1
32 |   fix_bert_emb: false
33 |   fix_word_emb: false
34 |   embedding_style:
35 |     single_token_item: true
36 |     emb_strategy: "w2v_bilstm"
37 |     num_rnn_layers: 1
38 |     bert_model_name: null
39 |     bert_lower_case: null
40 | 
41 | graph_embedding_args:
42 |   graph_embedding_share:
43 |     num_layers: 1
44 |     input_size: 300
45 |     hidden_size: 300
46 |     output_size: 300
47 |     direction_option: "undirected"
48 |     feat_drop: 0.0
49 |     attn_drop: 0.0
50 | 
51 |   graph_embedding_private:
52 |     aggregator_type: "lstm"
53 |     bias: true
54 |     norm: null
55 |     activation: "relu"
56 |     use_edge_weight: true
57 | 
58 | decoder_args:
59 |   rnn_decoder_share:
60 |     rnn_type: "lstm"
61 |     input_size: 300
62 |     hidden_size: 300
63 |     rnn_emb_input_size: 300
64 |     use_copy: true
65 |     graph_pooling_strategy: null
66 |     attention_type: "uniform"
67 |     fuse_strategy: "concatenate"
68 |     dropout: 0.3
69 |     teacher_forcing_rate: 1.0
70 | 
71 |   rnn_decoder_private:
72 |     max_decoder_step: 35
73 |     max_tree_depth: 8
74 |     use_sibling: false
75 | 


--------------------------------------------------------------------------------
/GraphML2022_demo/Math-word-problem/imgs/g2t.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/GraphML2022_demo/Math-word-problem/imgs/g2t.png


--------------------------------------------------------------------------------
/GraphML2022_demo/Math-word-problem/utils.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | import torch
 3 | import sympy
 4 | from random import randint
 5 | from sympy.parsing.sympy_parser import parse_expr
 6 | 
 7 | def convert_to_string(idx_list, form_manager):
 8 |     w_list = []
 9 |     for i in range(len(idx_list)):
10 |         w_list.append(form_manager.get_idx_symbol(int(idx_list[i])))
11 |     return " ".join(w_list)
12 | 
def is_all_same(c1, c2, form_manager):
    """Decide whether a candidate and a reference index sequence match.

    The two sequences match when they have the same length and are equal
    element by element, or, failing that, when ``is_solution_same`` reports
    them as equivalent.

    Args:
        c1: First indexable sequence of vocabulary indices.
        c2: Second indexable sequence of vocabulary indices.
        form_manager: Forwarded unchanged to ``is_solution_same``.

    Returns:
        ``True`` when the sequences match, ``False`` otherwise.
    """
    identical = len(c1) == len(c2) and not any(
        c1[pos] != c2[pos] for pos in range(len(c1))
    )
    if identical:
        return True
    # Token-level mismatch: fall back to comparing the solved equations.
    return True if is_solution_same(c1, c2, form_manager) else False
27 | 
28 | 
def is_solution_same(i1, i2, form_manager):
    """Check whether two equations, given as index sequences, solve to the same ``x``.

    Each sequence is turned into text through ``form_manager.get_idx_symbol``,
    split on ``'='``, parsed with sympy and solved for the symbol ``x``. Only
    the first solution of each equation is compared.

    Args:
        i1: Iterable of vocabulary indices for the first equation.
        i2: Iterable of vocabulary indices for the second equation.
        form_manager: Object exposing ``get_idx_symbol`` and ``unk_token``.

    Returns:
        ``False`` when either text lacks ``'='``, contains the unknown token,
        cannot be parsed/solved, or has no solution; otherwise the result of
        comparing the first solutions.
    """
    c1 = " ".join(form_manager.get_idx_symbol(idx) for idx in i1)
    c2 = " ".join(form_manager.get_idx_symbol(idx) for idx in i2)

    if ('=' not in c1) or ('=' not in c2):
        return False
    if (form_manager.unk_token in c1) or (form_manager.unk_token in c2):
        return False

    try:
        s1 = c1.split('=')
        s2 = c2.split('=')
        x = sympy.Symbol('x')
        # Only the text before the first '=' and between the first and second
        # '=' is used, exactly as before.
        lhs1, rhs1 = parse_expr(s1[0]), parse_expr(s1[1])
        lhs2, rhs2 = parse_expr(s2[0]), parse_expr(s2[1])
        res1 = sympy.solve(sympy.Eq(lhs1, rhs1), x)
        res2 = sympy.solve(sympy.Eq(lhs2, rhs2), x)

        if not res1 or not res2:
            return False
        return res1[0] == res2[0]
    except Exception:
        # Malformed or unsolvable predictions count as a mismatch. Catching
        # Exception (not BaseException) lets KeyboardInterrupt/SystemExit
        # propagate so a long evaluation can still be interrupted.
        return False
61 | 
def compute_accuracy(candidate_list, reference_list, form_manager):
    """Compute the fraction of candidates that match their references.

    Pairs are compared position by position with ``is_all_same``. When the
    two lists differ in length a notice is printed and only the first
    ``min(len(candidate_list), len(reference_list))`` pairs are scored.

    Args:
        candidate_list: Sized, indexable collection of predicted sequences.
        reference_list: Sized, indexable collection of reference sequences.
        form_manager: Forwarded unchanged to ``is_all_same``.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when there is nothing to
        compare (previously this case raised ``ZeroDivisionError``).
    """
    if len(candidate_list) != len(reference_list):
        print("candidate list has length {}, reference list has length {}\n".format(
            len(candidate_list), len(reference_list)))
    len_min = min(len(candidate_list), len(reference_list))
    if len_min == 0:
        # No pairs to score; avoid dividing by zero.
        return 0.0
    c = sum(
        1
        for i in range(len_min)
        if is_all_same(candidate_list[i], reference_list[i], form_manager)
    )
    return c / float(len_min)
74 | 
75 | 
def compute_tree_accuracy(candidate_list_, reference_list_, form_manager):
    """Copy both inputs into plain lists and score them with ``compute_accuracy``.

    Args:
        candidate_list_: Sized, indexable collection of predicted sequences.
        reference_list_: Sized, indexable collection of reference sequences.
        form_manager: Forwarded unchanged to ``compute_accuracy``.

    Returns:
        Whatever ``compute_accuracy`` returns for the copied lists.
    """
    candidates = [candidate_list_[k] for k in range(len(candidate_list_))]
    references = [reference_list_[k] for k in range(len(reference_list_))]
    return compute_accuracy(candidates, references, form_manager)
84 | 
def prepare_ext_vocab(batch_graph, src_vocab, device):
    """Build an extended copy of ``src_vocab`` for one batch and tag nodes with its ids.

    A deep copy of ``src_vocab`` is extended with every node token that
    currently maps to the unknown-token index, provided the node's ``"type"``
    attribute is missing or ``0``. The resulting id of every node token is
    stored in ``batch_graph.node_features["token_id_oov"]``.

    Args:
        batch_graph: Object with iterable ``node_attributes`` (dict-like items
            holding ``"token"`` and optionally ``"type"``) and a writable
            ``node_features`` mapping.
        src_vocab: Vocabulary exposing ``get_symbol_idx``, ``add_symbol`` and
            ``unk_token``; it is not modified.
        device: Target passed to ``Tensor.to`` for the id tensor.

    Returns:
        The extended vocabulary copy.
    """
    oov_dict = copy.deepcopy(src_vocab)
    token_ids = []
    for attrs in batch_graph.node_attributes:
        token = attrs["token"]
        node_type = attrs.get("type")
        if node_type is None or node_type == 0:
            maps_to_unk = oov_dict.get_symbol_idx(token) == oov_dict.get_symbol_idx(
                oov_dict.unk_token
            )
            if maps_to_unk:
                oov_dict.add_symbol(token)
        token_ids.append(oov_dict.get_symbol_idx(token))

    id_tensor = torch.tensor(token_ids, dtype=torch.long)
    batch_graph.node_features["token_id_oov"] = id_tensor.to(device)
    return oov_dict


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/__init__.py


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/evaluation.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import numpy as np
  3 | import datetime
  4 | 
  5 | from spodernet.utils.logger import Logger
  6 | from torch.autograd import Variable
  7 | from sklearn import metrics
  8 | 
# Module-level logger, created at import time. The name passed to Logger
# embeds str(datetime.datetime.now()), so every import yields a new name.
# NOTE(review): that timestamp contains spaces and ':' characters; confirm
# Logger/the target filesystem accepts them in a file name.
log = Logger('evaluation{0}.py.txt'.format(datetime.datetime.now()))
 10 | 
def ranking_and_hits(model, dev_rank_batcher, vocab, name, kg_graph=None):
    """Evaluate a link-prediction model with filtered ranking metrics.

    For every batch the model is queried in both directions, ``(e1, rel)``
    and ``(e2, rel_eval)``. Scores of the entities listed in ``e2_multi1`` /
    ``e2_multi2`` are zeroed (except the target itself), the remaining scores
    are sorted in descending order, and the position of the target entity is
    recorded. Hits@1..10, mean rank and mean reciprocal rank are logged for
    the "left" (first query), "right" (second query) and combined cases.

    Args:
        model: Object whose ``forward(entity, relation, kg_graph)`` returns a
            score tensor indexed here as ``[example, entity]``.
        dev_rank_batcher: Iterable of dicts with keys ``'e1'``, ``'e2'``,
            ``'rel'``, ``'rel_eval'``, ``'e2_multi1'`` and ``'e2_multi2'``;
            its ``state.loss`` is reset to ``[0]`` after every batch.
        vocab: Not used by this function.
        name: Label written to the log as a header for this evaluation run.
        kg_graph: Optional value forwarded unchanged to ``model.forward``.

    Returns:
        The mean reciprocal rank over all recorded ranks (both directions).
    """
    log.info('')
    log.info('-'*50)
    log.info(name)
    log.info('-'*50)
    log.info('')
    hits_left = []
    hits_right = []
    hits = []
    ranks = []
    ranks_left = []
    ranks_right = []
    # One bucket per cut-off: index k collects the 0/1 outcomes for Hits@(k+1).
    for i in range(10):
        hits_left.append([])
        hits_right.append([])
        hits.append([])

    for i, str2var in enumerate(dev_rank_batcher):
        e1 = str2var['e1']
        e2 = str2var['e2']
        rel = str2var['rel']
        rel_reverse = str2var['rel_eval']
        e2_multi1 = str2var['e2_multi1'].float()
        e2_multi2 = str2var['e2_multi2'].float()
        # Score both query directions: (e1, rel) -> e2 and (e2, rel_eval) -> e1.
        pred1 = model.forward(e1, rel, kg_graph)
        pred2 = model.forward(e2, rel_reverse, kg_graph)
        pred1, pred2 = pred1.data, pred2.data
        e1, e2 = e1.data, e2.data
        e2_multi1, e2_multi2 = e2_multi1.data, e2_multi2.data
        # NOTE: this inner loop reuses the name `i`, shadowing the batch index.
        for i in range(e1.shape[0]):
            # these filters contain ALL labels
            filter1 = e2_multi1[i].long()
            filter2 = e2_multi2[i].long()

            # NOTE(review): `num` is never read afterwards.
            num = e1[i, 0].item()
            # save the score of the target entity before filtering
            target_value1 = pred1[i,e2[i, 0].item()].item()
            target_value2 = pred2[i,e1[i, 0].item()].item()
            # zero the scores of all known cases (these are not interesting);
            # this corresponds to the filtered setting
            pred1[i][filter1] = 0.0
            pred2[i][filter2] = 0.0
            # write back the saved target scores
            pred1[i][e2[i]] = target_value1
            pred2[i][e1[i]] = target_value2


        # sort and rank
        max_values, argsort1 = torch.sort(pred1, 1, descending=True)
        max_values, argsort2 = torch.sort(pred2, 1, descending=True)

        argsort1 = argsort1.cpu().numpy()
        argsort2 = argsort2.cpu().numpy()
        for i in range(e1.shape[0]):
            # find the (zero-based) rank of the target entities
            rank1 = np.where(argsort1[i]==e2[i, 0].item())[0][0]
            rank2 = np.where(argsort2[i]==e1[i, 0].item())[0][0]
            # rank+1, since the lowest rank is rank 1 not rank 0
            ranks.append(rank1+1)
            ranks_left.append(rank1+1)
            ranks.append(rank2+1)
            ranks_right.append(rank2+1)

            # this could be done more elegantly, but here you go
            # rank1/rank2 are zero-based here, so `rank <= hits_level`
            # is a hit at cut-off hits_level+1.
            for hits_level in range(10):
                if rank1 <= hits_level:
                    hits[hits_level].append(1.0)
                    hits_left[hits_level].append(1.0)
                else:
                    hits[hits_level].append(0.0)
                    hits_left[hits_level].append(0.0)

                if rank2 <= hits_level:
                    hits[hits_level].append(1.0)
                    hits_right[hits_level].append(1.0)
                else:
                    hits[hits_level].append(0.0)
                    hits_right[hits_level].append(0.0)

        dev_rank_batcher.state.loss = [0]

    for i in range(10):
        log.info('Hits left @{0}: {1}'.format(i+1, np.mean(hits_left[i])))
        log.info('Hits right @{0}: {1}'.format(i+1, np.mean(hits_right[i])))
        log.info('Hits @{0}: {1}'.format(i+1, np.mean(hits[i])))
    # NOTE(review): the calls below pass the value as a separate argument
    # instead of calling .format(); presumably Logger.info formats the message
    # with its extra arguments — confirm against spodernet.utils.logger.
    log.info('Mean rank left: {0}', np.mean(ranks_left))
    log.info('Mean rank right: {0}', np.mean(ranks_right))
    log.info('Mean rank: {0}', np.mean(ranks))
    log.info('Mean reciprocal rank left: {0}', np.mean(1./np.array(ranks_left)))
    log.info('Mean reciprocal rank right: {0}', np.mean(1./np.array(ranks_right)))
    log.info('Mean reciprocal rank: {0}', np.mean(1./np.array(ranks)))

    return np.mean(1./np.array(ranks))


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/kinship/processed/KG_graph.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/kinship/processed/KG_graph.pt


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/kinship/raw/kinship.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/kinship/raw/kinship.tar.gz


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/preprocess.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Unpack the kinship archive and run the KG wrangling script on it.
# All paths are relative: run this from the directory that contains
# the kg_completion/ folder.

# Stop at the first failing step so wrangle_KG.py never runs on a
# missing or partially extracted dataset.
set -e

# -p creates missing parents and makes re-runs succeed when the
# directories already exist.
mkdir -p kg_completion/data/kinship
mkdir -p kg_completion/saved_models
tar -xvf kg_completion/kinship/raw/kinship.tar.gz -C kg_completion/data/kinship
python kg_completion/wrangle_KG.py kinship
7 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/spodernet/__init__.py


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/backends/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/spodernet/backends/__init__.py


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/backends/tfbackend.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | 
  3 | from spodernet.interfaces import IAtBatchPreparedObservable
  4 | from spodernet.utils.util import Timer
  5 | from spodernet.utils.global_config import Config
  6 | 
  7 | class TensorFlowConfig:
  8 |     inp = None
  9 |     support = None
 10 |     input_length = None
 11 |     support_length = None
 12 |     target = None
 13 |     index = None
 14 |     sess = None
 15 | 
 16 |     @staticmethod
 17 |     def init_batch_size(batch_size):
 18 |         TensorFlowConfig.inp = tf.placeholder(tf.int64, [batch_size, None])
 19 |         TensorFlowConfig.support = tf.placeholder(tf.int64, [batch_size, None])
 20 |         TensorFlowConfig.input_length = tf.placeholder(tf.int64, [batch_size,])
 21 |         TensorFlowConfig.support_length = tf.placeholder(tf.int64, [batch_size,])
 22 |         TensorFlowConfig.target = tf.placeholder(tf.int64, [batch_size])
 23 |         TensorFlowConfig.index = tf.placeholder(tf.int64, [batch_size])
 24 | 
 25 |     @staticmethod
 26 |     def get_session():
 27 |         if TensorFlowConfig.sess is None:
 28 |             TensorFlowConfig.sess = tf.Session()
 29 |         return TensorFlowConfig.sess
 30 | 
 31 | 
 32 | 
 33 | class TensorFlowConverter(IAtBatchPreparedObservable):
 34 | 
 35 |     def at_batch_prepared(self, batch_parts):
 36 |         inp, inp_len, sup, sup_len, t, idx = batch_parts
 37 |         if TensorFlowConfig.inp == None:
 38 |             log.error('You need to initialize the batch size via TensorflowConfig.init_batch_size(batchsize)!')
 39 |         feed_dict = {}
 40 |         feed_dict[TensorFlowConfig.inp] = inp
 41 |         feed_dict[TensorFlowConfig.support] = sup
 42 |         feed_dict[TensorFlowConfig.input_length] = inp_len
 43 |         feed_dict[TensorFlowConfig.support_length] = sup_len
 44 |         feed_dict[TensorFlowConfig.target] = t
 45 |         feed_dict[TensorFlowConfig.index] = idx
 46 | 
 47 |         str2var = {}
 48 |         str2var['input'] = TensorFlowConfig.inp
 49 |         str2var['input_length'] = TensorFlowConfig.input_length
 50 |         str2var['support'] = TensorFlowConfig.support
 51 |         str2var['support_length'] = TensorFlowConfig.support_length
 52 |         str2var['target'] = TensorFlowConfig.target
 53 |         str2var['index'] = TensorFlowConfig.index
 54 | 
 55 |         return str2var, feed_dict
 56 | 
 57 | def build_str2var_dict():
 58 |     str2var = {}
 59 |     if TensorFlowConfig.inp is not None:
 60 |         str2var['input'] = TensorFlowConfig.inp
 61 |     if TensorFlowConfig.support is not None:
 62 |         str2var['support'] = TensorFlowConfig.support
 63 |     if TensorFlowConfig.target is not None:
 64 |         str2var['target'] = TensorFlowConfig.target
 65 |     if TensorFlowConfig.input_length is not None:
 66 |         str2var['input_length'] = TensorFlowConfig.input_length
 67 |     if TensorFlowConfig.support_length is not None:
 68 |         str2var['support_length'] = TensorFlowConfig.support_length
 69 |     if TensorFlowConfig.index is not None:
 70 |         str2var['index'] = TensorFlowConfig.index
 71 |         return str2var
 72 | 
 73 | class TFTrainer(object):
 74 |     def __init__(self, model):
 75 |         self.sess = TensorFlowConfig.get_session()
 76 |         str2var = build_str2var_dict()
 77 |         self.logits, self.loss, self.argmax = model.forward(str2var)
 78 |         optimizer = tf.train.AdamOptimizer(0.001)
 79 | 
 80 |         if Config.L2 != 0.0:
 81 |             self.loss += tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()]) * Config.L2
 82 | 
 83 |         self.min_op = optimizer.minimize(self.loss)
 84 | 
 85 |         tf.global_variables_initializer().run(session=self.sess)
 86 | 
 87 |     def train_model(self, batcher, epochs=1, iterations=None):
 88 |         for epoch in range(epochs):
 89 |             for i, (str2var, feed_dict) in enumerate(batcher):
 90 |                 _, argmax_values = self.sess.run([self.min_op, self.argmax], feed_dict=feed_dict)
 91 | 
 92 |                 batcher.state.argmax = argmax_values
 93 |                 batcher.state.targets = feed_dict[TensorFlowConfig.target]
 94 | 
 95 |                 if iterations > 0:
 96 |                     if i == iterations: break
 97 | 
 98 |     def eval_model(self, batcher, iterations=None):
 99 |         for i, (str2var, feed_dict) in enumerate(batcher):
100 |             argmax_values = self.sess.run([self.argmax], feed_dict=feed_dict)[0]
101 | 
102 |             batcher.state.argmax = argmax_values
103 |             batcher.state.targets = feed_dict[TensorFlowConfig.target]
104 | 
105 |             if iterations > 0:
106 |                 if i == iterations: break
107 | 
108 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/backends/tfmodels.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | from tensorflow import placeholder
  3 | from spodernet.backends.tfbackend import TensorFlowConfig
  4 | from spodernet.utils.global_config import Config
  5 | from spodernet.frontend import AbstractModel
  6 | import numpy as np
  7 | 
  8 | def reader(inputs, lengths, output_size, contexts=(None, None), scope=None):
  9 |     with tf.variable_scope(scope or "reader") as varscope:
 10 | 
 11 |         cell = tf.contrib.rnn.LSTMCell(output_size, state_is_tuple=True,initializer=tf.contrib.layers.xavier_initializer())
 12 | 
 13 |         cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=1.0-Config.dropout)
 14 | 
 15 |         outputs, states = tf.nn.bidirectional_dynamic_rnn(
 16 |             cell,
 17 |             cell,
 18 |             inputs,
 19 |             sequence_length=lengths,
 20 |             initial_state_fw=contexts[0],
 21 |             initial_state_bw=contexts[1],
 22 |             dtype=tf.float32)
 23 | 
 24 |         return outputs, states
 25 | 
 26 | def predictor(inputs, targets, target_size):
 27 |     init = tf.contrib.layers.xavier_initializer(uniform=True) #uniform=False for truncated normal
 28 |     logits = tf.contrib.layers.fully_connected(inputs, target_size, weights_initializer=init, activation_fn=None)
 29 | 
 30 |     loss = tf.reduce_mean(
 31 |         tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
 32 |             labels=targets), name='predictor_loss')
 33 |     predict = tf.arg_max(tf.nn.softmax(logits), 1, name='prediction')
 34 |     return [logits, loss, predict]
 35 | 
 36 | 
 37 | class TFEmbedding(AbstractModel):
 38 | 
 39 |     def __init__(self, embedding_size, num_embeddings, scope=None):
 40 |         super(TFEmbedding, self).__init__()
 41 | 
 42 |         self.embedding_size = embedding_size
 43 |         self.scope = scope
 44 |         self.num_embeddings = num_embeddings
 45 | 
 46 |     def forward(self, str2var, *args):
 47 |         self.expected_str2var_keys(str2var, ['input', 'support'])
 48 |         self.expected_args('None', 'None')
 49 |         self.generated_outputs('input idx, support idx', 'both sequences have shape = [batch, timesteps, embedding dim]')
 50 | 
 51 |         embeddings = tf.get_variable("embeddings", [self.num_embeddings, self.embedding_size],
 52 |                                 initializer=tf.random_normal_initializer(0., 1./np.sqrt(self.embedding_size)),
 53 |                                 trainable=True, dtype="float32")
 54 | 
 55 |         with tf.variable_scope("embedders") as varscope:
 56 |             seqQ = tf.nn.embedding_lookup(embeddings, TensorFlowConfig.inp)
 57 |             varscope.reuse_variables()
 58 |             seqS = tf.nn.embedding_lookup(embeddings, TensorFlowConfig.support)
 59 | 
 60 |         return seqQ, seqS
 61 | 
 62 | class TFPairedBiDirectionalLSTM(AbstractModel):
 63 | 
 64 |     def __init__(self, hidden_size, scope=None, conditional_encoding=True):
 65 |         super(TFPairedBiDirectionalLSTM, self).__init__()
 66 |         self.hidden_size = hidden_size
 67 |         self.scope = scope
 68 |         if not conditional_encoding:
 69 |             raise NotImplementedError("conditional_encoding=False is not implemented yet.")
 70 | 
 71 |     def forward(self, str2var, *args):
 72 |         self.expected_str2var_keys(str2var, ['input_length', 'support_length'])
 73 |         self.expected_args('seq input, seq support', 'dimension of both: [batch, timesteps, embedding dim]')
 74 |         self.generated_outputs('stacked outputs of last timestep', 'dim is [batch_size, 2x hidden size]')
 75 | 
 76 |         seqQ, seqS = args
 77 | 
 78 |         with tf.variable_scope(self.scope or "conditional_reader_seq1") as varscope1:
 79 |             #seq1_states: (c_fw, h_fw), (c_bw, h_bw)
 80 |             _, seq1_states = reader(seqQ, TensorFlowConfig.input_length, self.hidden_size, scope=varscope1)
 81 |         with tf.variable_scope(self.scope or "conditional_reader_seq2") as varscope2:
 82 |             varscope1.reuse_variables()
 83 |             # each [batch_size x max_seq_length x output_size]
 84 |             outputs, states = reader(seqS, TensorFlowConfig.support_length, self.hidden_size, seq1_states, scope=varscope2)
 85 | 
 86 |         output = tf.concat([states[0][1], states[1][1]], 1)
 87 | 
 88 |         return [output]
 89 | 
 90 | class TFSoftmaxCrossEntropy(AbstractModel):
 91 | 
 92 |     def __init__(self, num_labels):
 93 |         super(TFSoftmaxCrossEntropy, self).__init__()
 94 |         self.num_labels = num_labels
 95 | 
 96 |     def forward(self, str2var, *args):
 97 |         self.expected_str2var_keys(str2var, ['target'])
 98 |         self.expected_args('some inputs', 'dimension: [batch, any]')
 99 |         self.generated_outputs('logits, loss, argmax', 'dimensions: logits = [batch, labels], loss = 1x1, argmax = [batch, 1]')
100 |         outputs_prev_layer = args[0]
101 | 
102 |         logits, loss, argmax = predictor(outputs_prev_layer, TensorFlowConfig.target, self.num_labels)
103 | 
104 |         return [logits, loss, argmax]
105 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/backends/torchbackend.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from torch.autograd import Variable
  3 | from itertools import chain
  4 | 
  5 | import torch
  6 | import numpy as np
  7 | 
  8 | from spodernet.interfaces import IAtBatchPreparedObservable
  9 | from spodernet.utils.util import Timer
 10 | from spodernet.utils.global_config import Config
 11 | 
 12 | class TorchConverter(IAtBatchPreparedObservable):
 13 |     def __init__(self, is_volatile):
 14 |         self.is_volatile = is_volatile
 15 | 
 16 |     def at_batch_prepared(self, str2var):
 17 |         for key in str2var.keys():
 18 |             if 'length' in key: continue
 19 |             if str2var[key].dtype == np.int32:
 20 |                 str2var[key] = np.int64(str2var[key])
 21 |             str2var[key] = Variable(torch.from_numpy(str2var[key]), volatile=self.is_volatile)
 22 |         return str2var
 23 | 
 24 | class TorchCUDAConverter(IAtBatchPreparedObservable):
 25 |     def __init__(self, device_id):
 26 |         self.device_id = device_id
 27 | 
 28 |     def at_batch_prepared(self, str2var):
 29 |         for key in str2var.keys():
 30 |             if 'length' in key: continue
 31 |             str2var[key] = str2var[key].cuda(self.device_id, True)
 32 |         return str2var
 33 | 
 34 | 
 35 | class TorchNegativeSampling(IAtBatchPreparedObservable):
 36 |     def __init__(self, max_index, keys_to_corrupt=['input', 'target']):
 37 |         self.max_index = max_index
 38 |         self.keys_to_corrupt = keys_to_corrupt
 39 |         self.rdm = np.random.RandomState(34534)
 40 | 
 41 |     def at_batch_prepared(self, str2var):
 42 |         samples_per_key = Config.batch_size/len(self.keys_to_corrupt)
 43 |         for i, key in enumerate(self.keys_to_corrupt):
 44 |             variable = str2var[key]
 45 |             new_idx = self.rdm.choice(self.max_index, samples_per_key)
 46 |             if Config.cuda:
 47 |                 variable_corrupted = Variable(torch.cuda.LongTensor(variable.size()))
 48 |                 variable_corrupted.data.copy_(variable.data)
 49 |                 variable_corrupted.data[i*samples_per_key: (i+1)*samples_per_key] = torch.from_numpy(new_idx).cuda()
 50 |             else:
 51 |                 variable_corrupted = Variable(torch.LongTensor(variable.size()))
 52 |                 variable_corrupted.data.copy_(variable.data)
 53 |                 variable_corrupted.data[i*samples_per_key: (i+1)*samples_per_key] = torch.from_numpy(new_idx)
 54 |             str2var[key + '_corrupt'] = variable_corrupted
 55 | 
 56 |         return str2var
 57 | 
 58 | 
 59 | ######################################
 60 | #
 61 | #          Util functions
 62 | #
 63 | ######################################
 64 | 
 65 | 
 66 | def get_list_of_torch_modules(model):
 67 |     modules = []
 68 |     for module in model.modules:
 69 |         if hasattr(module, 'modules'):
 70 |             for module2 in module.modules:
 71 |                 modules.append(module2)
 72 |         else:
 73 |             modules.append(module)
 74 |     return modules
 75 | 
 76 | 
 77 | 
 78 | def train_model(model, batcher, epochs=1, iterations=None):
 79 |     modules = get_list_of_torch_modules(model)
 80 |     generators = []
 81 |     for module in modules:
 82 |         if Config.cuda:
 83 |             module.cuda()
 84 |         generators.append(module.parameters())
 85 | 
 86 |     parameters = chain.from_iterable(generators)
 87 |     optimizer = torch.optim.Adam(parameters, lr=0.001)
 88 |     for module in modules:
 89 |         module.train()
 90 | 
 91 |     for epoch in range(epochs):
 92 |         for i, str2var in enumerate(batcher):
 93 |             optimizer.zero_grad()
 94 |             logits, loss, argmax = model.forward(str2var)
 95 |             loss.backward()
 96 |             optimizer.step()
 97 |             batcher.state.argmax = argmax
 98 |             batcher.state.targets = str2var['target']
 99 | 
100 |             if iterations > 0:
101 |                 if i == iterations: break
102 | 
103 | 
def eval_model(model, batcher, iterations=None):
    """Run ``model`` over ``batcher`` with every torch module in eval mode.

    Args:
        model: object exposing ``modules`` (flattened through
            ``get_list_of_torch_modules``) and ``forward(str2var)`` returning
            ``(logits, loss, argmax)``.
        batcher: iterable of ``str2var`` batches; its ``state`` attribute
            receives the latest argmax and targets.
        iterations: optional limit; evaluation stops after the batch whose
            zero-based index equals ``iterations``. ``None`` (default) or a
            non-positive value means no limit.
    """
    modules = get_list_of_torch_modules(model)
    for module in modules:
        module.eval()

    # ``None > 0`` raises TypeError on Python 3, so resolve the limit once.
    has_limit = iterations is not None and iterations > 0

    for i, str2var in enumerate(batcher):
        logits, loss, argmax = model.forward(str2var)
        batcher.state.argmax = argmax
        batcher.state.targets = str2var['target']

        if has_limit and i == iterations:
            break
116 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/backends/torchmodels.py:
--------------------------------------------------------------------------------
  1 | from torch.nn import LSTM
  2 | from torch.autograd import Variable
  3 | 
  4 | import torch
  5 | import torch.nn.functional as F
  6 | 
  7 | from spodernet.frontend import AbstractModel
  8 | from spodernet.utils.global_config import Config
  9 | 
 10 | class TorchEmbedding(torch.nn.Module, AbstractModel):
 11 |     def __init__(self, embedding_size, num_embeddings):
 12 |         super(TorchEmbedding, self).__init__()
 13 | 
 14 |         self.emb= torch.nn.Embedding(num_embeddings,
 15 |                 embedding_size, padding_idx=0)#, scale_grad_by_freq=True, padding_idx=0)
 16 | 
 17 |     def forward(self, str2var, *args):
 18 |         self.expected_str2var_keys_oneof(str2var, ['input', 'support'])
 19 |         self.expected_args('None', 'None')
 20 |         self.generated_outputs('input idx, support idx', 'both sequences have shape = [batch, timesteps, embedding dim]')
 21 | 
 22 |         embedded_results = []
 23 |         if 'input' in str2var:
 24 |             embedded_results.append(self.emb(str2var['input']))
 25 | 
 26 |         if 'support' in str2var:
 27 |             embedded_results.append(self.emb(str2var['support']))
 28 | 
 29 |         return embedded_results
 30 | 
 31 | class TorchBiDirectionalLSTM(torch.nn.Module, AbstractModel):
 32 |     def __init__(self, input_size, hidden_size,
 33 |             dropout=0.0, layers=1,
 34 |             bidirectional=True, to_cuda=False, conditional_encoding=True):
 35 |         super(TorchBiDirectionalLSTM, self).__init__()
 36 | 
 37 |         use_bias = True
 38 |         num_directions = (1 if not bidirectional else 2)
 39 | 
 40 |         self.lstm = LSTM(input_size,hidden_size,layers,
 41 |                          use_bias,True,0.2,bidirectional)
 42 | 
 43 |         # states of both LSTMs
 44 |         self.h0 = None
 45 |         self.c0 = None
 46 | 
 47 |         self.h0 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 48 |         self.c0 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 49 | 
 50 |         if Config.cuda:
 51 |             self.h0 = self.h0.cuda()
 52 |             self.c0 = self.c0.cuda()
 53 | 
 54 |     def forward(self, str2var, *args):
 55 |         self.expected_str2var_keys(str2var, [])
 56 |         self.expected_args('embedded seq', 'size [batch, time steps, embedding dim]')
 57 |         self.generated_outputs('LSTM output seq', 'size [batch, time steps, 2x hidden dim]')
 58 |         seq = args
 59 |         self.h0.data.zero_()
 60 |         self.c0.data.zero_()
 61 |         out, hid = self.lstm(seq, (self.h0, self.c0))
 62 |         return [out, hid]
 63 | 
 64 | 
 65 | class TorchPairedBiDirectionalLSTM(torch.nn.Module, AbstractModel):
 66 |     def __init__(self, input_size, hidden_size,
 67 |             dropout=0.0, layers=1,
 68 |             bidirectional=True, to_cuda=False, conditional_encoding=True):
 69 |         super(TorchPairedBiDirectionalLSTM, self).__init__()
 70 | 
 71 |         self.conditional_encoding = conditional_encoding
 72 |         use_bias = True
 73 |         num_directions = (1 if not bidirectional else 2)
 74 | 
 75 |         self.conditional_encoding = conditional_encoding
 76 |         self.lstm1 = LSTM(input_size,hidden_size,layers,
 77 |                          use_bias,True,Config.dropout,bidirectional)
 78 |         self.lstm2 = LSTM(input_size,hidden_size,layers,
 79 |                          use_bias,True,Config.dropout,bidirectional)
 80 | 
 81 |         # states of both LSTMs
 82 |         self.h01 = None
 83 |         self.c01 = None
 84 |         self.h02 = None
 85 |         self.c02 = None
 86 | 
 87 | 
 88 |         self.h01 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 89 |         self.c01 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 90 | 
 91 |         if Config.cuda:
 92 |             self.h01 = self.h01.cuda()
 93 |             self.c01 = self.c01.cuda()
 94 | 
 95 |         if not self.conditional_encoding:
 96 |             self.h02 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 97 |             self.c02 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 98 | 
 99 |             if Config.cuda:
100 |                 self.h02 = self.h02.cuda()
101 |                 self.c02 = self.c02.cuda()
102 | 
103 | 
104 |     def forward(self, str2var, *args):
105 |         self.expected_str2var_keys(str2var, [])
106 |         self.expected_args('embedded input seq, embedded seq support', 'both of size [batch, time steps, embedding dim]')
107 |         self.generated_outputs('LSTM output seq inputs, LSTM output seq support', 'both of size [batch, time steps, 2x hidden dim]')
108 |         seq1, seq2 = args
109 |         if self.conditional_encoding:
110 |             self.h01.data.zero_()
111 |             self.c01.data.zero_()
112 |             out1, hid1 = self.lstm1(seq1, (self.h01, self.c01))
113 |             out2, hid2 = self.lstm2(seq2, hid1)
114 |         else:
115 |             self.h01.data.zero_()
116 |             self.c01.data.zero_()
117 |             self.h02.data.zero_()
118 |             self.c02.data.zero_()
119 |             out1, hid1 = self.lstm1(seq1, (self.h01, self.c01))
120 |             out2, hid2 = self.lstm2(seq2, (self.h02, self.c02))
121 |         return [out1, out2]
122 | 
class TorchVariableLengthOutputSelection(torch.nn.Module, AbstractModel):
    """Pick, per example, the LSTM output at the last valid time step."""

    def __init__(self):
        super(TorchVariableLengthOutputSelection, self).__init__()
        # Kept for interface compatibility; forward() does not populate them.
        self.b1 = None
        self.b2 = None

    def forward(self, str2var, *args):
        """Select the outputs at index ``length - 1`` for both sequences.

        Args:
            str2var: must provide 'input_length' and 'support_length'; their
                ``.data`` is iterated to obtain one length per example.
            *args: exactly two tensors, the output sequences of the two LSTMs.

        Returns:
            One-element list with the two selections, each reshaped to
            ``Config.batch_size`` rows, concatenated along dimension 1.
        """
        self.expected_str2var_keys(str2var, ['input_length', 'support_length'])
        self.expected_args('LSTM output sequence input , LSTM output sequence support', 'dimension of both: [batch, time steps, 2x LSTM hidden size]')
        self.generated_outputs('stacked bidirectional outputs of last timestep', 'dim is [batch_size, 4x hidden size]')

        output_lstm1, output_lstm2 = args

        l1, l2 = str2var['input_length'], str2var['support_length']

        # Allocate the masks on every call. They used to be bound to locals
        # only inside ``if self.b1 == None``, which would have raised
        # UnboundLocalError as soon as ``self.b1`` held a value.
        b1 = torch.ByteTensor(output_lstm1.size())
        b2 = torch.ByteTensor(output_lstm2.size())
        if Config.cuda:
            b1 = b1.cuda()
            b2 = b2.cuda()

        # Mark the whole feature vector at the last valid step of each example.
        b1.fill_(0)
        for i, num in enumerate(l1.data):
            b1[i,num-1,:] = 1
        out1 = output_lstm1[b1].view(Config.batch_size, -1)

        b2.fill_(0)
        for i, num in enumerate(l2.data):
            b2[i,num-1,:] = 1
        out2 = output_lstm2[b2].view(Config.batch_size, -1)

        out = torch.cat([out1,out2], 1)
        return [out]
156 | 
class TorchSoftmaxCrossEntropy(torch.nn.Module, AbstractModel):
    """Classification head: linear projection, log-softmax and NLL loss."""

    def __init__(self, input_dim, num_labels):
        """Create the linear projection from ``input_dim`` to ``num_labels``."""
        super(TorchSoftmaxCrossEntropy, self).__init__()
        self.num_labels = num_labels
        self.projection_to_labels = torch.nn.Linear(input_dim, num_labels)

    def forward(self, str2var, *args):
        """Score the previous layer's output against ``str2var['target']``.

        Returns:
            ``[logits, loss, argmax]`` where ``argmax`` holds the indices
            returned by ``torch.topk(..., 1)`` on the log-probabilities.
        """
        self.expected_str2var_keys(str2var, ['target'])
        self.expected_args('some inputs', 'dimension: [batch, any]')
        self.generated_outputs('logits, loss, argmax', 'dimensions: logits = [batch, labels], loss = 1x1, argmax = [batch, 1]')

        features = args[0]
        targets = str2var['target']

        logits = self.projection_to_labels(features)
        log_probs = F.log_softmax(logits)
        loss = F.nll_loss(log_probs, targets)
        _, argmax = torch.topk(log_probs.data, 1)

        return [logits, loss, argmax]
178 | 
179 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/spodernet/data/__init__.py


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/data/snli2spoder.py:
--------------------------------------------------------------------------------
1 | '''Downloads SNLI data and wrangles it into the spoder format'''
2 | 
3 | 
4 | if __name__ == '__main__':
5 |     snli2spoder()
6 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/frontend.py:
--------------------------------------------------------------------------------
  1 | from itertools import chain
  2 | 
  3 | from spodernet.utils.global_config import Config, Backends
  4 | 
  5 | from spodernet.utils.logger import Logger
  6 | log = Logger('frontend.py.txt')
  7 | 
  8 | 
  9 | class Model(object):
 10 | 
 11 |     def __init__(self, input_module=None):
 12 |         self.modules = []
 13 |         self.input_module = input_module
 14 |         self.module = self
 15 | 
 16 |     def add(self, module):
 17 |         self.modules.append(module)
 18 | 
 19 |     def forward(self, str2var, *inputs):
 20 |         outputs = inputs
 21 |         if inputs == None:
 22 |             outputs = []
 23 |         for module in self.modules:
 24 |             outputs = module.forward(str2var, *outputs)
 25 |         return outputs
 26 | 
 27 | class Trainer(object):
 28 |     def __init__(self, model):
 29 |         self.model = model
 30 | 
 31 |         self.trainer_backend = None
 32 |         self.train_func = None
 33 |         self.eval_func = None
 34 |         if Config.backend == Backends.TENSORFLOW:
 35 |             from spodernet.backends.tfbackend import TFTrainer
 36 |             self.trainer_backend = TFTrainer(model)
 37 |             self.train_func = lambda _, batch, epochs, iterations: self.trainer_backend.train_model(batch, epochs, iterations)
 38 |             self.eval_func = lambda _, batch, iterations: self.trainer_backend.eval_model(batch, iterations)
 39 |         elif Config.backend == Backends.TORCH:
 40 |             from spodernet.backends.torchbackend import train_model, eval_model
 41 |             self.train_func = train_model
 42 |             self.eval_func = eval_model
 43 | 
 44 |     def train(self, batcher, epochs=1, iterations=None):
 45 |         self.train_func(self.model, batcher, epochs, iterations)
 46 | 
 47 |     def evaluate(self, batcher, iterations=None):
 48 |         self.eval_func(self.model, batcher, iterations)
 49 | 
 50 | class AbstractModel(object):
 51 | 
 52 |     def __init__(self):
 53 |         super(AbstractModel, self).__init__()
 54 |         self.input_str_args = None
 55 |         self.output_str_args = None
 56 |         self.used_keys = None
 57 | 
 58 |     def forward(self, str2var, *args):
 59 |         raise NotImplementedError("Classes that inherit from AbstractModel need to implement the forward method.")
 60 | 
 61 |     @property
 62 |     def modules(self):
 63 |         raise NotImplementedError("Classes that inherit from AbstractModel need to overrite the modules property.")
 64 | 
 65 |     def expected_str2var_keys(self, str2var, keys):
 66 |         self.used_keys = keys
 67 |         for key in keys:
 68 |             if key not in str2var:
 69 |                 log.error('Variable with name {0} expected, but not found in str2variable dict with keys {1}'.format(key, str2var.keys()))
 70 | 
 71 |     def expected_str2var_keys_oneof(self, str2var, keys):
 72 |         self.used_keys = keys
 73 |         one_exists = False
 74 |         for key in keys:
 75 |             if key in str2var:
 76 |                 one_exists = True
 77 |         if not one_exists:
 78 |             log.error('At least one of these variable was expected: {0}. But str2var only has these variables: {1}.', keys, str2var.keys())
 79 | 
 80 |     def expected_args(self, str_arg_names, str_arg_description):
 81 |         log.debug_once('Expected args {0}'.format(str_arg_names))
 82 |         log.debug_once('Info for the expected arguments: {0}'.format(str_arg_description))
 83 |         self.input_str_args = str_arg_names
 84 | 
 85 |     def generated_outputs(self, str_output_names, str_output_description):
 86 |         log.debug_once('Generated outputs: {0}'.format(str_output_names))
 87 |         log.debug_once('Info for the provided outputs: {0}'.format(str_output_description))
 88 |         self.output_str_args = str_output_names
 89 |         self.used_keys
 90 |         self.input_str_args
 91 |         self.output_str_args
 92 |         message = '{0} + {1} -> {2}'.format(self.used_keys, self.input_str_args, self.output_str_args)
 93 |         log.info_once(message)
 94 | 
 95 | 
 96 | class Embedding(object):
 97 |     def __init__(self, embedding_size, num_embeddings, scope=None):
 98 |         self.embedding_size = embedding_size
 99 |         self.scope = scope
100 |         self.num_embeddings = num_embeddings
101 | 
102 |         self.module = None
103 |         if Config.backend == Backends.TENSORFLOW:
104 |             from spodernet.backends.tfmodels import TFEmbedding
105 |             self.module = TFEmbedding(embedding_size, num_embeddings, scope)
106 |         elif Config.backend == Backends.TORCH:
107 |             from spodernet.backends.torchmodels import TorchEmbedding
108 |             self.module = TorchEmbedding(embedding_size, num_embeddings)
109 |             self.modules = [self.module]
110 | 
111 |     def forward(self, str2var, *args):
112 |         return self.module.forward(str2var, *args)
113 | 
114 | 
class PairedBiDirectionalLSTM(object):
    """Backend-agnostic paired bidirectional LSTM encoder."""

    def __init__(self, input_size, hidden_size, scope=None, conditional_encoding=True):
        """Instantiate the encoder for the configured backend.

        For torch the encoder is a two-stage ``Model``: the paired LSTM
        followed by the variable-length output selection. ``self.module``
        stays ``None`` for any other backend value.
        """
        super(PairedBiDirectionalLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.scope = scope
        self.module = None

        if Config.backend == Backends.TENSORFLOW:
            from spodernet.backends.tfmodels import TFPairedBiDirectionalLSTM
            self.module = TFPairedBiDirectionalLSTM(hidden_size, scope, conditional_encoding)
            return

        if Config.backend == Backends.TORCH:
            from spodernet.backends.torchmodels import TorchPairedBiDirectionalLSTM, TorchVariableLengthOutputSelection
            pipeline = Model()
            pipeline.add(TorchPairedBiDirectionalLSTM(input_size, hidden_size, conditional_encoding=conditional_encoding))
            pipeline.add(TorchVariableLengthOutputSelection())

            self.module = pipeline
            self.modules = pipeline.modules

    def forward(self, str2var, *args):
        """Delegate to the backend module's ``forward``."""
        backend_module = self.module
        return backend_module.forward(str2var, *args)
137 | 
138 | 
class SoftmaxCrossEntropy(object):
    """Backend-agnostic softmax cross-entropy classification head."""

    def __init__(self, input_size, num_labels):
        """Instantiate the head for the configured backend.

        ``self.module`` stays ``None`` for any other backend value. Only the
        torch branch also sets ``self.modules``.
        """
        super(SoftmaxCrossEntropy, self).__init__()
        self.num_labels = num_labels
        self.module = None

        if Config.backend == Backends.TENSORFLOW:
            from spodernet.backends.tfmodels import TFSoftmaxCrossEntropy
            self.module = TFSoftmaxCrossEntropy(num_labels)
            return

        if Config.backend == Backends.TORCH:
            from spodernet.backends.torchmodels import TorchSoftmaxCrossEntropy
            self.module = TorchSoftmaxCrossEntropy(input_size, num_labels)
            self.modules  = [self.module]

    def forward(self, str2var, *args):
        """Delegate to the backend module's ``forward``."""
        backend_module = self.module
        return backend_module.forward(str2var, *args)
155 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/hooks.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.stats
  3 | import datetime
  4 | 
  5 | from spodernet.interfaces import IAtIterEndObservable, IAtEpochEndObservable, IAtEpochStartObservable
  6 | from spodernet.utils.util import Timer
  7 | from spodernet.utils.global_config import Config, Backends
  8 | 
  9 | from spodernet.utils.logger import Logger
 10 | log = Logger('hooks.py.txt')
 11 | 
 12 | class AbstractHook(IAtIterEndObservable, IAtEpochEndObservable):
 13 |     def __init__(self, name, metric_name, print_every_x_batches):
 14 |         self.epoch_errors = []
 15 |         self.current_scores = []
 16 |         self.name = name
 17 |         self.iter_count = 0
 18 |         self.print_every = print_every_x_batches
 19 |         self.metric_name = metric_name
 20 |         self.epoch = 1
 21 | 
 22 |         # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
 23 |         self.n = 0
 24 |         self.epoch_n = 0
 25 |         self.mean = 0
 26 |         self.M2 = 0
 27 |         self.load_backend_specific_functions()
 28 | 
 29 |     def load_backend_specific_functions(self):
 30 |         if Config.backend == Backends.TORCH:
 31 |             from torch.autograd import Variable
 32 |             def convert_state(state):
 33 |                 if isinstance(state.targets, Variable):
 34 |                     state.targets = state.targets.data
 35 |                 if isinstance(state.argmax, Variable):
 36 |                     state.argmax = state.argmax.data
 37 |                 if isinstance(state.pred, Variable):
 38 |                     state.pred = state.pred.data
 39 |                 if isinstance(state.loss, Variable):
 40 |                     state.loss = state.loss.data
 41 |                 if isinstance(state.multi_labels, Variable):
 42 |                     state.multi_labels = state.multi_labels.data
 43 | 
 44 |                 return state
 45 | 
 46 |             self.convert_state = convert_state
 47 |         else:
 48 |             self.convert_state = lambda x: x
 49 | 
 50 |     def calculate_metric(self, state):
 51 |         raise NotImplementedError('Classes that inherit from abstract hook need to implement the calcualte metric method.')
 52 | 
 53 |     def at_end_of_iter_event(self, state):
 54 |         state = self.convert_state(state)
 55 |         metric = self.calculate_metric(state)
 56 |         #print(metric)
 57 | 
 58 |         self.n += 1
 59 |         delta = metric - self.mean
 60 |         self.mean += delta/self.n
 61 |         delta2 = metric - self.mean
 62 |         self.M2 += delta*delta2
 63 | 
 64 |         self.current_scores.append(metric)
 65 |         self.iter_count += 1
 66 |         if self.iter_count % self.print_every == 0:
 67 |             lower, upper, m, n = self.print_statistic()
 68 |             self.n = 0
 69 |             self.mean = 0
 70 |             self.M2 = 0
 71 |             return lower, upper, m, n
 72 |         return 0, 0, self.mean, self.n
 73 | 
 74 |     def at_end_of_epoch_event(self, state):
 75 |         if self.n == 0: return 0, 0, 0, 0
 76 |         self.epoch_errors.append(self.get_confidence_intervals())
 77 |         lower, upper, m, n = self.print_statistic(True)
 78 |         del self.current_scores[:]
 79 |         self.n = 0
 80 |         self.mean = 0
 81 |         self.M2 = 0
 82 |         self.epoch += 1
 83 |         self.iter_count = 0
 84 |         return lower, upper, m, n
 85 | 
 86 |     def get_confidence_intervals(self, percentile=0.99, limit=1000):
 87 |         z = scipy.stats.norm.ppf(percentile)
 88 |         var = self.M2/ (self.n)
 89 |         SE = np.sqrt(var/self.n)
 90 |         lower = self.mean-(z*SE)
 91 |         upper = self.mean+(z*SE)
 92 |         return [self.n, lower, self.mean, upper]
 93 | 
 94 |     def print_statistic(self, at_epoch_end=False):
 95 |         n, lower, m, upper = self.get_confidence_intervals()
 96 |         str_message = '{3} {4}: {2:.5}\t99% CI: ({0:.5}, {1:.5}), n={5}'.format(lower, upper, m, self.name, self.metric_name, self.n)
 97 |         if at_epoch_end: log.info('\n')
 98 |         if at_epoch_end: log.info('#'*40)
 99 |         if at_epoch_end: log.info(' '*10 + 'COMPLETED EPOCH: {0}'.format(self.epoch) + ' '*30)
100 |         log.info(str_message)
101 |         if at_epoch_end: log.info('#'*40)
102 |         if at_epoch_end: log.info('\n')
103 |         return lower, upper, m, n
104 | 
105 | 
class AccuracyHook(AbstractHook):
    """Hook reporting the fraction of predictions that match the targets."""

    def __init__(self, name='', print_every_x_batches=1000):
        super(AccuracyHook, self).__init__(name, 'Accuracy', print_every_x_batches)
        self.func = None
        self.topk = 1
        if Config.backend == Backends.TORCH:
            import torch
            self.func = lambda x: torch.sum(x)

    def calculate_metric(self, state):
        """Return the share of examples in ``state`` that were predicted correctly.

        For torch, a multi-column ``state.argmax`` counts a hit in any column.

        Raises:
            Exception: if ``Config.backend`` is not a supported backend.
        """
        backend = Config.backend

        if backend == Backends.TORCH:
            predictions = state.argmax
            hits = 0.0
            if len(predictions.size()) == 1:
                hits += self.func(state.targets==predictions)
            else:
                for column in range(predictions.size(1)):
                    hits += self.func(state.targets==predictions[:, column])
            batch = predictions.size()[0]
            return hits.item()/np.float32(batch)

        if backend == Backends.TENSORFLOW or backend == Backends.TEST:
            # Both backends carry numpy arrays in the state.
            batch = state.argmax.shape[0]
            return np.sum(state.targets==state.argmax)/np.float32(batch)

        raise Exception('Backend has unsupported value {0}'.format(Config.backend))
134 | 
135 | 
class TopKRankingLoss(AbstractHook):
    """Hook reporting how often the target index is among the top-k predictions."""

    def __init__(self, k, filtered=False, name='', print_every_x_batches=1000):
        metric_name = '{1}Hits@{0} loss'.format(k, ('' if not filtered else 'Filtered '))
        super(TopKRankingLoss, self).__init__(name, metric_name, print_every_x_batches)
        self.func = None
        self.argsort = None
        self.sum_func = None
        self.k = k
        self.filtered = filtered
        if Config.backend == Backends.TORCH:
            import torch
            self.argsort = lambda x, k: torch.topk(x, k)
            self.sum_func = lambda x: torch.sum(x)

    def calculate_metric(self, state):
        """Return (hits within the top k) / (size of the first dimension of pred).

        When ``filtered`` is set, ``state.pred`` is modified in place: entries
        selected by ``state.multi_labels`` are overwritten with -100000.0 and
        the columns indexed by ``state.targets`` are then restored.

        Raises:
            Exception: if the backend is not the torch backend.
        """
        if Config.backend != Backends.TORCH:
            raise Exception('Backend has unsupported value {0}'.format(Config.backend))

        if self.filtered:
            import torch
            # Keep the target columns, mask the flagged entries, then put the
            # target columns back so the targets themselves are never masked.
            saved = torch.index_select(state.pred,1,state.targets)
            state.pred[state.multi_labels.byte()] = -100000.0
            state.pred.index_copy_(1, state.targets, saved)

        _, top_indices = self.argsort(state.pred, self.k)
        hits = 0
        for rank in range(self.k):
            hits += self.sum_func(top_indices[:,rank] == state.targets)
        n = state.pred.size()[0]
        return hits/np.float32(n)
166 | 
167 | 
168 | 
class LossHook(AbstractHook):
    """Hook that reports ``state.loss``."""

    def __init__(self, name='', print_every_x_batches=1000):
        super(LossHook, self).__init__(name, 'Loss', print_every_x_batches)

    def calculate_metric(self, state):
        """Return the loss; on the torch backend it is unwrapped with ``.item()``."""
        if Config.backend != Backends.TORCH:
            return state.loss
        converted = self.convert_state(state)
        return converted.loss.item()
179 | 
180 | 
class IntersectionHook(AbstractHook):
    """Hook reporting the row-wise overlap between predictions and targets."""

    def __init__(self, name='', print_every_x_batches=1000):
        super(IntersectionHook, self).__init__(name, 'Intersection', print_every_x_batches)

    def calculate_metric(self, state):
        """Return (sum of per-row intersection sizes) / (number of target elements).

        ``state.pred`` and ``state.targets`` are converted to numpy arrays
        (moved to the CPU first when ``Config.cuda`` is set).
        """
        state = self.convert_state(state)
        preds = state.pred
        targets = state.targets
        if Config.cuda:
            preds = preds.cpu()
            targets = targets.cpu()

        preds = preds.numpy()
        targets = targets.numpy()
        n = targets.size
        if n == 0:
            # Nothing to compare; avoid a division by zero.
            return 0.0

        # Iterate over the rows actually present instead of assuming the
        # batch holds exactly Config.batch_size rows (a shorter final batch
        # previously raised IndexError).
        k = 0
        for pred_row, target_row in zip(preds, targets):
            k += np.intersect1d(pred_row, target_row).size

        return k/float(n)
201 | 
202 | 
203 | 
class ETAHook(AbstractHook, IAtEpochStartObservable):
    """Hook that estimates the total epoch time and logs the time remaining."""

    def __init__(self, name='', print_every_x_batches=1000):
        super(ETAHook, self).__init__(name, 'ETA', print_every_x_batches)
        self.t = Timer(silent=True)
        self.cumulative_t = 0.0
        self.skipped_first = False

    def get_time_string(self, seconds):
        """Format a duration in seconds as ``H:MM:SS``; negative parts become 0."""
        minutes, secs = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        hours, minutes, secs = max(hours, 0), max(minutes, 0), max(secs, 0)
        return "%d:%02d:%02d" % (hours, minutes, secs)

    def calculate_metric(self, state):
        """Extrapolate the time measured so far to all ``state.num_batches``."""
        total_batches = state.num_batches
        batches_done = state.current_idx
        elapsed = self.t.tick('ETA')
        estimate = (elapsed/batches_done)*total_batches
        # Second tick on the same name directly after reading the value.
        # NOTE(review): presumably this restarts the 'ETA' measurement —
        # confirm against Timer.
        self.t.tick('ETA')
        self.cumulative_t = elapsed

        return estimate

    def print_statistic(self):
        """Log the remaining-time estimate; the first call after a reset is skipped.

        Returns:
            ``(0, 0, 0, 0)`` on the skipped call, otherwise
            ``(lower, upper, mean, n)`` with the times formatted as strings.
        """
        if not self.skipped_first:
            # the first estimation is very unreliable for time measures
            self.skipped_first = True
            return 0, 0, 0, 0
        n, lower, m, upper = self.get_confidence_intervals()
        # Subtract the time already spent to turn totals into remaining time.
        spent = self.cumulative_t
        lower = self.get_time_string(lower - spent)
        m = self.get_time_string(m - spent)
        upper = self.get_time_string(upper - spent)
        log.info('{3} {4}: {2}\t99% CI: ({0}, {1}), n={5}'.format(lower, upper, m, self.name, self.metric_name, n))
        return lower, upper, m, n

    def at_start_of_epoch_event(self, batcher_state):
        """Tick the 'ETA' and 'Epoch' timers at the start of an epoch."""
        self.t.tick('ETA')
        self.t.tick('Epoch')

    def at_end_of_epoch_event(self, state):
        """Record and log the epoch time, then reset the running statistics.

        Returns:
            The value returned by ``tock('Epoch')``.
        """
        self.t.tock('ETA')
        epoch_time = self.t.tock('Epoch')
        self.epoch_errors.append([epoch_time])
        log.info('Total epoch time: {0}'.format(self.get_time_string(epoch_time)))
        del self.current_scores[:]
        self.n = 0
        self.mean = 0
        self.M2 = 0
        self.skipped_first = False
        self.epoch += 1
        return epoch_time
258 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/interfaces.py:
--------------------------------------------------------------------------------
 1 | # These are completely useless, but they signify intent, which is important.
 2 | 
 3 | class IAtIterEndObservable(object):
 4 |     def at_end_of_iter_event(self, batcher_state):
 5 |         raise NotImplementedError('Subclasses of IAtIterEndObservable need to override the end_of_iter_event method')
 6 | 
 7 | class IAtEpochStartObservable(object):
 8 |     def at_start_of_epoch_event(self, batcher_state):
 9 |         raise NotImplementedError('Subclasses of IAtEpochStartObservable need to override the at_start_of_epoch method')
10 | 
class IAtEpochEndObservable(object):
    """Interface for objects that react to the end of an epoch."""

    def at_end_of_epoch_event(self, batcher_state):
        """Called with the batcher state at the end of an epoch.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        # The message now names the method that actually has to be overridden.
        raise NotImplementedError('Subclasses of IAtEpochEndObservable need to override the at_end_of_epoch_event method')
14 | 
class IAtBatchPreparedObservable(object):
    """Interface for objects that react once a batch has been prepared."""

    def at_batch_prepared(self, batch_parts):
        """Called with the parts of the prepared batch.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        message = 'Subclasses of IAtBatchPreparedObservable need to override the at_batch_prepared method'
        raise NotImplementedError(message)
18 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/preprocessing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/spodernet/preprocessing/__init__.py


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/preprocessing/vocab.py:
--------------------------------------------------------------------------------
  1 | from collections import Counter
  2 | 
  3 | import numpy as np
  4 | import os
  5 | import time
  6 | import datetime
  7 | import pickle
  8 | import urllib
  9 | # import bashmagic
 10 | import time
 11 | import json
 12 | 
 13 | from spodernet.utils.util import get_data_path, save_data, xavier_uniform_weight
 14 | from os.path import join
 15 | 
 16 | from spodernet.utils.util import Logger
 17 | log = Logger('vocab.py.txt')
 18 | 
 19 | '''This models the vocabulary and token embeddings'''
 20 | 
 21 | class Vocab(object):
 22 |     '''Class that manages work/char embeddings'''
 23 | 
 24 |     def __init__(self, path, vocab = Counter(), labels = {}):
 25 |         '''Constructor.
 26 |         Args:
 27 |             vocab: Counter object with vocabulary.
 28 |         '''
 29 |         self.index = None
 30 |         token2idx = {}
 31 |         idx2token = {}
 32 |         self.label2idx = {}
 33 |         self.idx2label = {}
 34 |         self.glove_cache = {}
 35 |         for i, item in enumerate(vocab.items()):
 36 |             token2idx[item[0]] = i+1
 37 |             idx2token[i+1] = item[0]
 38 | 
 39 |         for idx in labels:
 40 |             self.label2idx[labels[idx]] = idx
 41 |             self.idx2label[idx] = labels[idx]
 42 | 
 43 |         # out of vocabulary token
 44 |         token2idx['OOV'] = int(0)
 45 |         idx2token[int(0)] = 'OOV'
 46 |         # empty = 0
 47 |         token2idx[''] = int(1)
 48 |         idx2token[int(1)] = ''
 49 | 
 50 |         self.token2idx = token2idx
 51 |         self.idx2token = idx2token
 52 |         self.path = path
 53 |         if len(idx2token.keys()) > 0:
 54 |             self.next_idx = int(np.max(list(idx2token.keys())) + 1)
 55 |         else:
 56 |             self.next_idx = int(2)
 57 | 
 58 |         if len(self.idx2label.keys()) > 0:
 59 |             self.next_label_2dx = int(int(np.max(self.idx2label.keys())) + 1)
 60 |         else:
 61 |             self.next_label_idx = int(0)
 62 | 
 63 |     @property
 64 |     def num_token(self):
 65 |         return len(self.token2idx)
 66 | 
 67 |     @property
 68 |     def num_labels(self):
 69 |         return len(self.label2idx)
 70 | 
 71 |     def add_token(self, token):
 72 |         if token not in self.token2idx:
 73 |             self.token2idx[token] = self.next_idx
 74 |             self.idx2token[self.next_idx] = token
 75 |             self.next_idx += 1
 76 | 
 77 |     def add_label(self, label):
 78 |         if label not in self.label2idx:
 79 |             self.label2idx[label] = self.next_label_idx
 80 |             self.idx2label[self.next_label_idx] = label
 81 |             self.next_label_idx += 1
 82 | 
 83 |     def get_idx(self, word):
 84 |         '''Gets the idx if it exists, otherwise returns -1.'''
 85 |         if word in self.token2idx:
 86 |             return self.token2idx[word]
 87 |         else:
 88 |             return self.token2idx['OOV']
 89 | 
 90 |     def get_idx_label(self, label):
 91 |         '''Gets the idx of the label'''
 92 |         return self.label2idx[label]
 93 | 
 94 |     def get_word(self, idx):
 95 |         '''Gets the word if it exists, otherwise returns OOV.'''
 96 |         if idx in self.idx2token:
 97 |             return self.idx2token[idx]
 98 |         else:
 99 |             return self.idx2token[0]
100 | 
101 |     def save_to_disk(self, name=''):
102 |         log.info('Saving vocab to: {0}'.format(self.path))
103 |         pickle.dump([self.token2idx, self.idx2token, self.label2idx,
104 |             self.idx2label], open(self.path + name, 'wb'))
105 | 
106 |     def load_from_disk(self, name=''):
107 |         if not os.path.exists(self.path + name):
108 |             return False
109 |         timestamp = time.ctime(os.path.getmtime(self.path + name))
110 |         timestamp = datetime.datetime.strptime(timestamp, '%a %b %d %H:%M:%S %Y')
111 |         age_in_hours = (datetime.datetime.now() - timestamp).seconds/60./60.
112 |         log.info('Loading vocab from: {0}'.format(self.path + name))
113 |         self.token2idx, self.idx2token, self.label2idx, self.idx2label = pickle.load(open(self.path, 'rb'))
114 |         if age_in_hours > 12:
115 |             log.info('Vocabulary outdated: {0}'.format(self.path + name))
116 |             return False
117 |         else:
118 |             return True
119 | 
120 |     def download_glove(self):
121 |         if not os.path.exists(join(get_data_path(), 'glove')):
122 |             log.info('Glove data is missing, dowloading data now...')
123 |             os.mkdir(join(get_data_path(), 'glove'))
124 |             bashmagic.wget("http://nlp.stanford.edu/data/glove.6B.zip", join(get_data_path(),'glove'))
125 |             bashmagic.unzip(join(get_data_path(), 'glove', 'glove.6B.zip'), join(get_data_path(), 'glove'))
126 | 
127 |     def prepare_glove(self, dimension):
128 |         if self.index is not None: return
129 |         if not os.path.exists(join(get_data_path(), 'glove', 'index_50.p')):
130 |             dims = [50, 100, 200, 300]
131 |             base_filename = 'glove.6B.{0}d.txt'
132 |             paths = [join(get_data_path(), 'glove', base_filename.format(dim)) for dim in dims]
133 |             for path, dim in zip(paths, dims):
134 |                 index = {}
135 |                 index = {'PATH' : path}
136 |                 with open(path, 'rb') as f:
137 |                     log.info('Building index for {0}', path)
138 |                     while True:
139 |                         prev_pos = f.tell()
140 |                         line = f.readline().decode('utf-8')
141 |                         if line == '': break
142 |                         next_pos = f.tell()
143 |                         data = line.strip().split(' ')
144 |                         token = data[0]
145 |                         index[token] = (prev_pos, next_pos)
146 | 
147 |                 log.info('Saving glove index...')
148 |                 json.dump(index, open(join(get_data_path(), 'glove', 'index_{0}.p'.format(dim)), 'w'))
149 | 
150 |         log.info('Loading glove index...')
151 |         self.index = json.load(open(join(get_data_path(), 'glove', 'index_{0}.p'.format(dimension)), 'r'))
152 | 
153 | 
154 |     def load_matrix(self, dim):
155 |         log.info('Initializing glove matrix...')
156 |         X = xavier_uniform_weight(len(self.token2idx), dim)
157 |         log.info('Loading vectors into glove matrix with dimension: {0}', X.shape)
158 |         pretrained_count = 0
159 |         n = len(self.token2idx)-2
160 |         for i, (token, idx) in enumerate(self.token2idx.items()):
161 |             if i % 10000 == 0: print(i)
162 |             vec = self.get_glove_list(token, dim)
163 |             if vec is not None:
164 |                 X[idx] = vec
165 |                 pretrained_count += 1
166 |         log.info('Filled matrix with {0} pretrained embeddings and {1} xavier uniform initialized embeddings.', pretrained_count, n-pretrained_count)
167 |         return X
168 | 
169 |     def get_glove_vector(self, token, dimension=300):
170 |         if token in self.glove_cache: return self.glove_cache[token]
171 |         vec = self.get_glove_list(token, dimension)
172 |         if vec is not None:
173 |             arr = np.array(vec, dtype=np.float32)
174 |             self.glove_cache[token] = arr
175 |             return arr
176 |         else: return None
177 | 
178 |     def get_glove_list(self, token, dimension=300):
179 |         assert dimension in [50, 100, 200, 300], 'Dimension not supported! Only dimension 50, 100, 200, and 300 are supported!'
180 |         self.download_glove()
181 |         self.prepare_glove(dimension)
182 |         vec = None
183 |         if token in self.index:
184 |             p = self.index['PATH']
185 |             with open(p, 'rb') as f:
186 |                 start, end = self.index[token]
187 |                 f.seek(start)
188 |                 line = f.read(end-start).decode('utf-8')
189 |                 data = line.strip().split(' ')
190 |                 vec = data[1:]
191 | 
192 |         return vec
193 | 
194 |     def exists_in_glove(self, token, dimension=300):
195 |         self.download_glove()
196 |         self.prepare_glove(dimension)
197 |         return token in self.index
198 | 
199 | 
200 |     def get_glove_matrix(self, dimension):
201 |         assert dimension in [50, 100, 200, 300], 'Dimension not supported! Only dimension 50, 100, 200, and 300 are supported!'
202 |         self.download_glove()
203 |         return self.load_matrix(dimension)
204 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/kg_completion/spodernet/utils/__init__.py


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/utils/cuda_utils.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import torch
 3 | from torch.cuda import Event
 4 | 
 5 | class CUDATimer(object):
 6 |     def __init__(self, silent=False):
 7 |         self.cumulative_secs = {}
 8 |         self.current_ticks = {}
 9 |         self.silent = silent
10 |         self.end = Event(enable_timing=True, blocking=True)
11 | 
12 |     def tick(self, name='default'):
13 |         if name not in self.current_ticks:
14 |             start = Event(enable_timing=True, blocking=True)
15 |             start.record()
16 |             self.current_ticks[name] = start
17 | 
18 |             return 0.0
19 |         else:
20 |             if name not in self.cumulative_secs:
21 |                 self.cumulative_secs[name] = 0
22 |             self.end.record()
23 |             self.end.synchronize()
24 |             self.cumulative_secs[name] += self.current_ticks[name].elapsed_time(self.end)/1000.
25 |             self.current_ticks.pop(name)
26 | 
27 |             return self.cumulative_secs[name]
28 | 
29 |     def tock(self, name='default'):
30 |         self.tick(name)
31 |         value = self.cumulative_secs[name]
32 |         if not self.silent:
33 |             print('Time taken for {0}: {1:.8f}s'.format(name, value))
34 |         self.cumulative_secs.pop(name)
35 |         if name in self.current_ticks:
36 |             del self.current_ticks[name]
37 |         self.current_ticks.pop(name, None)
38 | 
39 |         return value
40 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/utils/global_config.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from collections import namedtuple
  3 | from spodernet.utils.logger import Logger
  4 | log = Logger('global_config.py.txt')
  5 | 
  6 | class Backends:
  7 |     TORCH = 'pytorch'
  8 |     TENSORFLOW = 'tensorflow'
  9 |     TEST = 'test'
 10 |     CNTK = 'cntk'
 11 | 
 12 | 
 13 | class Config:
 14 |     dropout = 0.0
 15 |     batch_size = 128
 16 |     learning_rate = 0.001
 17 |     backend = Backends.TORCH
 18 |     L2 = 0.000
 19 |     cuda = False
 20 |     embedding_dim = 128
 21 |     hidden_size = 256
 22 |     input_dropout = 0.0
 23 |     feature_map_dropout = 0.0
 24 |     use_conv_transpose = False
 25 |     use_bias = True
 26 |     optimizer = 'adam'
 27 |     learning_rate_decay = 1.0
 28 |     label_smoothing_epsilon = 0.1
 29 |     epochs = 1000
 30 |     dataset = None
 31 |     process = False
 32 |     model_name = None
 33 | 
 34 |     @staticmethod
 35 |     def parse_argv(argv):
 36 |         file_name = argv[0]
 37 |         args = argv[1:]
 38 |         assert len(args) % 2 == 0, 'Global parser expects an even number of arguments.'
 39 |         values = []
 40 |         names = []
 41 |         for i, token in enumerate(args):
 42 |             if i % 2 == 0:
 43 |                 names.append(token)
 44 |             else:
 45 |                 values.append(token)
 46 | 
 47 |         for i in range(len(names)):
 48 |             if names[i] in alias2params:
 49 |                 log.debug('Replaced parameters alias {0} with name {1}', names[i], alias2params[names[i]])
 50 |                 names[i] = alias2params[names[i]]
 51 | 
 52 |         for i in range(len(names)):
 53 |             name = names[i]
 54 |             if name[:2] == '--': continue
 55 |             if name not in params2type:
 56 |                 log.info('List of possible parameters: {0}', params2type.keys())
 57 |                 log.error('Parameter {0} does not exist. Prefix your custom parameters with -- to skip parsing for global config', name)
 58 |             values[i] = params2type[name](values[i])
 59 | 
 60 |         for name, value in zip(names, values):
 61 |             if name[:2] == '--': continue
 62 |             params2field[name](value)
 63 |             log.info('Set parameter {0} to {1}', name, value)
 64 | 
 65 |     input_dropout = 0.0
 66 |     feature_map_dropout = 0.0
 67 |     use_transposed_convolutions = False
 68 |     use_bias = True
 69 | 
 70 | params2type = {}
 71 | params2type['learning_rate'] = lambda x: float(x)
 72 | params2type['learning_rate_decay'] = lambda x: float(x)
 73 | params2type['dropout'] = lambda x: float(x)
 74 | params2type['batch_size'] = lambda x: int(x)
 75 | params2type['L2'] = lambda x: float(x)
 76 | params2type['embedding_dim'] = lambda x: int(x)
 77 | params2type['hidden_size'] = lambda x: int(x)
 78 | params2type['input_dropout'] = lambda x: float(x)
 79 | params2type['label_smoothing_epsilon'] = lambda x: float(x)
 80 | params2type['feature_map_dropout'] = lambda x: float(x)
 81 | params2type['use_conv_transpose'] = lambda x: x.lower() == 'true' or x == '1'
 82 | params2type['use_bias'] = lambda x: x.lower() == 'true' or x == '1'
 83 | params2type['optimizer'] = lambda x: x
 84 | params2type['epochs'] = lambda x: int(x)
 85 | params2type['dataset'] = lambda x: x
 86 | params2type['model_name'] = lambda x: x
 87 | params2type['process'] = lambda x: x.lower() == 'true' or x == '1'
 88 | 
 89 | alias2params = {}
 90 | alias2params['lr'] = 'learning_rate'
 91 | alias2params['lr_decay'] = 'learning_rate_decay'
 92 | alias2params['l2'] = 'L2'
 93 | alias2params['input_drop'] = 'input_dropout'
 94 | alias2params['hidden_drop'] = 'dropout'
 95 | alias2params['feat_drop'] = 'feature_map_dropout'
 96 | alias2params['bias'] = 'use_bias'
 97 | alias2params['conv_trans'] = 'use_conv_transpose'
 98 | alias2params['opt'] = 'optimizer'
 99 | alias2params['label_smoothing'] = 'label_smoothing_epsilon'
100 | alias2params['model'] = 'model_name'
101 | 
102 | 
103 | 
# Setters that store a parsed value on the Config attribute of the same name.
# The default argument binds the attribute name at creation time so every
# setter writes to its own field.
params2field = {
    field: (lambda x, _field=field: setattr(Config, _field, x))
    for field in (
        'learning_rate',
        'learning_rate_decay',
        'dropout',
        'batch_size',
        'L2',
        'embedding_dim',
        'hidden_size',
        'input_dropout',
        'feature_map_dropout',
        'use_conv_transpose',
        'use_bias',
        'optimizer',
        'label_smoothing_epsilon',
        'epochs',
        'dataset',
        'process',
        'model_name',
    )
}
122 | 
123 | 
124 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/utils/logger.py:
--------------------------------------------------------------------------------
  1 | from enum import IntEnum
  2 | from os.path import join
  3 | 
  4 | import os
  5 | import datetime
  6 | import numpy as np
  7 | import time
  8 | 
  9 | # util functions start
 10 | #
 11 | # these function also exist in util.py,
 12 | # but since logger is imported everywere these function need to be included here
 13 | 
 14 | def get_home_path():
 15 |     return os.environ['HOME']
 16 | 
 17 | def get_logger_path():
 18 |     return join(get_home_path(), '.data', 'log_files')
 19 | 
 20 | def make_dirs_if_not_exists(path):
 21 |     if not os.path.exists(path):
 22 |         os.makedirs(path)
 23 | 
 24 | # util functions end
 25 | class GlobalLogger:
 26 |     timestr = None
 27 |     global_logger_path = None
 28 |     f_global_logger = None
 29 | 
 30 |     @staticmethod
 31 |     def init():
 32 |         GlobalLogger.timestr = time.strftime("%Y%m%d-%H%M%S")
 33 |         if not os.path.exists(join(get_logger_path(), 'full_logs')):
 34 |             os.mkdir(join(get_logger_path(), 'full_logs'))
 35 |         GlobalLogger.global_logger_path = join(get_logger_path(), 'full_logs', GlobalLogger.timestr +  '.txt')
 36 |         GlobalLogger.f_global_logger = open(GlobalLogger.global_logger_path, 'w')
 37 | 
 38 |     @staticmethod
 39 |     def flush():
 40 |         GlobalLogger.f_global_logger.close()
 41 |         GlobalLogger.f_global_logger = open(GlobalLogger.global_logger_path, 'a')
 42 | 
 43 |     def __del__(self):
 44 |         GlobalLogger.f_global_logger.close()
 45 | 
 46 | class LogLevel(IntEnum):
 47 |     STATISTICAL = 0
 48 |     DEBUG = 1
 49 |     INFO = 2
 50 |     WARNING = 3
 51 |     ERROR = 4
 52 | 
 53 | class Logger:
 54 |     GLOBAL_LOG_LEVEL = LogLevel.INFO
 55 |     LOG_PROPABILITY = 0.05
 56 |     USE_GLOBAL_STATISTICAL_LOG_PROBABILITY = False
 57 |     PRINT_COUNT = 2
 58 | 
 59 |     def __init__(self, file_name, write_type='w'):
 60 |         path = join(get_logger_path(), file_name)
 61 |         path_statistical = join(get_logger_path(), 'statistical_' + file_name)
 62 |         self.path = path
 63 |         make_dirs_if_not_exists(get_logger_path())
 64 |         self.f = open(path, write_type)
 65 |         self.f_statistical = open(path_statistical, write_type)
 66 |         self.rdm = np.random.RandomState(234234)
 67 |         self.debug('Created log file at: {0} with write type: {1}'.format(path, write_type))
 68 |         self.once_dict = {}
 69 | 
 70 |     def __del__(self):
 71 |         self.f.close()
 72 |         self.f_statistical.close()
 73 | 
 74 |     def wrap_message(self, message, log_level, *args):
 75 |         return '{0} ({2}): {1}'.format(datetime.datetime.now(), message.format(*args), log_level.name)
 76 | 
 77 |     def statistical(self, message, p, *args):
 78 |         if Logger.GLOBAL_LOG_LEVEL == LogLevel.STATISTICAL:
 79 |             self._log_statistical(message, p, *args)
 80 | 
 81 |     def debug(self, message, *args):
 82 |         self._log(message, LogLevel.DEBUG, *args)
 83 | 
 84 |     def info_once(self, message, *args):
 85 |         if LogLevel.INFO < Logger.GLOBAL_LOG_LEVEL: return
 86 |         if message not in self.once_dict: self.once_dict[message] = 0
 87 |         if self.once_dict[message] < Logger.PRINT_COUNT:
 88 |             self.once_dict[message] += 1
 89 |             self._log(message, LogLevel.INFO, *args)
 90 | 
 91 |     def debug_once(self, message, *args):
 92 |         if LogLevel.DEBUG < Logger.GLOBAL_LOG_LEVEL: return
 93 |         if message not in self.once_dict: self.once_dict[message] = 0
 94 |         if self.once_dict[message] < Logger.PRINT_COUNT:
 95 |             self.once_dict[message] += 1
 96 |             self._log(message, LogLevel.DEBUG, *args)
 97 | 
 98 |     def info(self, message, *args):
 99 |         self._log(message, LogLevel.INFO, *args)
100 | 
101 |     def warning(self, message, *args):
102 |         self._log(message, LogLevel.WARNING, *args)
103 | 
104 |     def error(self, message, *args):
105 |         self._log(message, LogLevel.ERROR, *args)
106 |         raise Exception(message.format(*args))
107 | 
108 |     def _log_statistical(self, message, p, *args):
109 |         rdm_num = self.rdm.rand()
110 |         if Logger.USE_GLOBAL_STATISTICAL_LOG_PROBABILITY:
111 |             if rdm_num < Logger.LOG_PROPABILITY:
112 |                 message = self.wrap_message(message, LogLevel.STATISTICAL, *args)
113 |                 self.f_statistical.write(message + '\n')
114 |         else:
115 |             if rdm_num < p:
116 |                 message = self.wrap_message(message, LogLevel.STATISTICAL, *args)
117 |                 self.f_statistical.write(message + '\n')
118 | 
119 |     def _log(self, message, log_level=LogLevel.INFO, *args):
120 |         if log_level >= Logger.GLOBAL_LOG_LEVEL:
121 |             message = self.wrap_message(message, log_level, *args)
122 |             if message.strip() != '':
123 |                 print(message)
124 |                 self.f.write(message + '\n')
125 |                 if GlobalLogger.f_global_logger is None: GlobalLogger.init()
126 |                 GlobalLogger.f_global_logger.write(message + '\n')
127 | 
128 | 
129 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/utils/spacy_util.py:
--------------------------------------------------------------------------------
 1 | import spacy
 2 | 
 3 | subjects = set(['nsubj'])
 4 | objects = set(['dobj', 'pobj'])
 5 | 
 6 | def merge_noun_phrases(sent_doc):
 7 |     for np in sent_doc.noun_chunks:
 8 |        np.merge(np.root.tag_, np.text, np.root.ent_type_)
 9 | 
def merge_entities(sent_doc):
    """Call ``merge`` on every entity span of ``sent_doc``.

    Each entity is merged with its root's dependency label, its own text and
    its entity label.
    """
    for entity in sent_doc.ents:
        dependency = entity.root.dep_
        entity.merge(dependency, entity.text, entity.label_)
13 | 
def merge_verbs(sent_doc):
    """Merge runs of consecutive VERB tokens, trying length 3 before length 2.

    For every position whose token and the ``span_length - 1`` tokens before
    it are all tagged VERB, the slice ``[i-1 : i+span_length-1]`` is merged
    and the scan skips ahead by ``span_length - 1``.
    """
    for span_length in (3, 2):
        pos = 1
        while pos < len(sent_doc)-1:
            if sent_doc[pos].pos_ == 'VERB':
                # A list (not a generator) so every preceding token is
                # inspected, exactly as an accumulating &= would do.
                preceding_are_verbs = all(
                    [sent_doc[pos-back].pos_ == 'VERB' for back in range(1, span_length)]
                )
                if preceding_are_verbs:
                    sent_doc[pos-1:pos+span_length-1].merge()
                    pos += span_length-1
            pos += 1
30 | 
def merge_with_set(sent_doc, to_match, write_key='pobj'):
    """Merge tokens preceded by a run of matching, non-verb tokens.

    Span lengths 5, 4, 3 and 2 are tried in that order. A token qualifies
    when its ``dep_`` is contained in ``write_key`` and each of the
    ``span_length - 1`` tokens before it has a ``dep_`` in ``to_match`` and
    is not a VERB. The slice ``[i-1 : i+span_length-1]`` is then merged and
    the token at ``i-1`` receives the qualifying token's ``dep_``.
    """
    for span_length in (5, 4, 3, 2):
        pos = span_length-1
        while pos < len(sent_doc)-1:
            dep = sent_doc[pos].dep_
            # NOTE: write_key is a string, so this is a substring test.
            if dep in write_key:
                qualifies = True
                for back in range(1, span_length):
                    previous = sent_doc[pos-back]
                    qualifies &= previous.dep_ in to_match
                    qualifies &= previous.pos_ != 'VERB'
                if qualifies:
                    sent_doc[pos-1:pos+span_length-1].merge()
                    sent_doc[pos-1].dep_ = dep
                    pos += span_length-1
            pos += 1
49 | 
def merge_tokens(sent_doc):
    """Run all merge passes over ``sent_doc`` in a fixed order.

    Noun phrases, entities and verb runs are merged first, followed by
    repeated ``merge_with_set`` passes for prepositional and object phrases.
    """
    merge_noun_phrases(sent_doc)
    merge_entities(sent_doc)
    merge_verbs(sent_doc)

    # (dependency labels to match, write key, number of repetitions)
    passes = (
        (['pobj', 'prep'], 'pobj', 3),
        (['attr', 'punct', 'cc', 'conj', 'pobj'], 'pobj', 2),
        (['dobj', 'pobj'], 'dobj', 2),
        (['attr', 'punct', 'cc', 'conj', 'pobj', 'dobj'], 'dobj', 2),
    )
    for labels, write_key, repetitions in passes:
        for _ in range(repetitions):
            merge_with_set(sent_doc, set(labels), write_key)
63 | 
def extract_triples(sent_doc):
    """Collect [subject, verb, object] token triples from `sent_doc`.

    Tokens are visited in order while a candidate triple is built up:
    a subject token starts (or restarts) the candidate, a verb extends it
    only when it directly follows the subject, and an object completes it
    only when it directly follows the verb. Anything out of sequence
    discards the candidate.

    Returns:
        A list of three-token lists, in the order they were completed.
    """
    found = []
    current = []
    for tok in sent_doc:
        if tok.pos_ == 'VERB':
            if not current:
                # A verb with no pending subject is ignored entirely.
                continue
            if current[-1].dep_ in subjects:
                current.append(tok)
            else:
                current = []
        if tok.dep_ in subjects:
            # A subject always becomes the sole element of the candidate.
            current = [tok]
        if tok.dep_ in objects:
            if not current:
                continue
            if current[-1].pos_ == 'VERB':
                current.append(tok)
                found.append(current)
            # Whether or not the triple was completed, start over.
            current = []
    return found
88 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/spodernet/utils/util.py:
--------------------------------------------------------------------------------
  1 | from os.path import join
  2 | from scipy.sparse import csr_matrix, spmatrix
  3 | 
  4 | import h5py
  5 | import os
  6 | import time
  7 | import os
  8 | import numpy as np
  9 | import torch
 10 | 
 11 | from spodernet.utils.logger import Logger
 12 | log = Logger('util.py.txt')
 13 | 
 14 | rdm = np.random.RandomState(2345235)
 15 | 
 16 | def save_dense_hdf(path, data):
 17 |     '''Writes a numpy array to a hdf5 file under the given path.'''
 18 |     log.debug_once('Saving hdf5 file to: {0}', path)
 19 |     h5file = h5py.File(path, "w")
 20 |     h5file.create_dataset("default", data=data)
 21 |     h5file.close()
 22 | 
 23 | 
 24 | def load_dense_hdf(path, keyword='default'):
 25 |     '''Reads and returns a numpy array for a hdf5 file'''
 26 |     log.debug_once('Reading hdf5 file from: {0}', path)
 27 |     h5file = h5py.File(path, 'r')
 28 |     dset = h5file.get(keyword)
 29 |     data = dset[:]
 30 |     h5file.close()
 31 |     return data
 32 | 
 33 | def save_sparse_hdf(path, data):
 34 |     shape = data.shape
 35 |     sparse = csr_matrix(data)
 36 |     folder, filename = os.path.split(path)
 37 |     save_dense_hdf(join(folder, 'data_' + filename), sparse.data)
 38 |     save_dense_hdf(join(folder, 'indices_' + filename), sparse.indices)
 39 |     save_dense_hdf(join(folder, 'indptr_' + filename), sparse.indptr)
 40 |     save_dense_hdf(join(folder, 'shape_dense_' + filename), shape)
 41 |     save_dense_hdf(join(folder, 'shape_sparse_' + filename), sparse.shape)
 42 | 
 43 | def load_sparse_hdf(path, keyword='default'):
 44 |     folder, filename = os.path.split(path)
 45 |     data = load_dense_hdf(join(folder, 'data_' + filename))
 46 |     indices = load_dense_hdf(join(folder, 'indices_' + filename))
 47 |     indptr = load_dense_hdf(join(folder, 'indptr_' + filename))
 48 |     shape = load_dense_hdf(join(folder, 'shape_dense_' + filename))
 49 |     shape_sparse = load_dense_hdf(join(folder, 'shape_sparse_' + filename))
 50 |     return csr_matrix((data, indices, indptr), shape=shape_sparse).toarray().reshape(shape)
 51 | 
 52 | def load_data(path):
 53 |     folder, filename = os.path.split(path)
 54 |     if os.path.exists(join(folder, 'indptr_' + filename)):
 55 |         data = load_sparse_hdf(path)
 56 |         return data
 57 |     else:
 58 |         return load_dense_hdf(path)
 59 | 
 60 | def save_data(path, data):
 61 |     assert data.size > 0
 62 |     is_sparse = isinstance(data, spmatrix)
 63 |     if is_sparse:
 64 |         save_sparse_hdf(path, data)
 65 |         return
 66 | 
 67 |     zero = (data == 0.0).sum()
 68 |     percent = zero/float(data.size)
 69 |     if percent > 0.5:
 70 |         save_sparse_hdf(path, data)
 71 |     else:
 72 |         save_dense_hdf(path, data)
 73 | 
 74 | 
 75 | def load_hdf5_paths(paths, limit=None):
 76 |     data = []
 77 |     for path in paths:
 78 |         if limit != None:
 79 |             data.append(load_data(path)[:limit])
 80 |         else:
 81 |             data.append(load_data(path))
 82 |     return data
 83 | 
 84 | def get_home_path():
 85 |     return os.environ['HOME']
 86 | 
 87 | def get_data_path():
 88 |     return join(os.environ['HOME'], '.data')
 89 | 
 90 | def make_dirs_if_not_exists(path):
 91 |     if not os.path.exists(path):
 92 |         os.makedirs(path)
 93 | 
 94 | # taken from pytorch; gain parameter is omitted
 95 | def xavier_uniform_weight(fan_in, fan_out):
 96 |     std = np.sqrt(2.0 / (fan_in + fan_out))
 97 |     a = np.sqrt(3.0) * std
 98 |     return np.float32(rdm.uniform(-a, a, size=(fan_in, fan_out)))
 99 | 
def embedding_sequence2text(vocab, embedding, break_at_0=True):
    """Translate rows of word indices back into lists of words.

    Args:
        vocab: object exposing `get_word(idx)`.
        embedding: 2-D collection of indices, given as a numpy array, a
            torch Variable or a torch tensor (tensors are moved to the CPU
            and converted to numpy first).
        break_at_0: when True (the default), conversion of a row stops at
            the first index equal to 0. When False the whole row is
            converted, zeros included. This flag used to be ignored and the
            row was always cut at the first zero.

    Returns:
        A list with one list of words per row.
    """
    if isinstance(embedding, np.ndarray):
        emb = embedding
    elif isinstance(embedding, torch.autograd.Variable):
        emb = embedding.data.cpu().numpy()
    else:
        emb = embedding.cpu().numpy()

    sentences = []
    for row in emb:
        sentence_array = []
        for idx in row:
            if break_at_0 and idx == 0:
                break
            sentence_array.append(vocab.get_word(idx))
        sentences.append(sentence_array)
    return sentences
116 | 
class PercentileRejecter(object):
    """Tells whether each new value lies above a running percentile.

    All values ever seen are kept. The percentile is recomputed every
    `compute_every` calls; that interval grows by one whenever the fresh
    percentile is within 5% of the stored threshold and shrinks by one
    (never below 1) whenever it is not.
    """

    def __init__(self, above_percentile_threshold):
        self.percentile_threshold = above_percentile_threshold
        self.values = []
        self.threshold_value = 0
        self.current_iter = 0
        self.compute_every = 1

    def above_percentile(self, value, percentile=None):
        """Record `value` and report whether it exceeds the percentile.

        Always returns False until 20 values have been recorded. When
        `percentile` is given it is computed on the spot and stored as the
        new threshold; otherwise the adaptive schedule described on the
        class decides between a fresh percentile and the stored threshold.
        """
        self.values.append(value)
        self.current_iter += 1
        if len(self.values) < 20:
            return False

        if percentile is not None:
            cutoff = np.percentile(self.values, percentile)
            self.threshold_value = cutoff
            return value > cutoff

        if self.current_iter % self.compute_every != 0:
            # Not a recompute step: compare against the stored threshold.
            return value > self.threshold_value

        cutoff = np.percentile(self.values, self.percentile_threshold)
        drifted = (cutoff*1.05 < self.threshold_value
                   or cutoff*0.95 > self.threshold_value)
        if drifted:
            self.threshold_value = cutoff
            self.compute_every = max(self.compute_every - 1, 1)
        else:
            self.compute_every += 1
        return value > cutoff
147 | 
148 | 
class Timer(object):
    """Named stopwatches that accumulate elapsed wall-clock seconds.

    `tick` toggles a stopwatch on and off; `tock` stops it, reports the
    accumulated total and forgets the stopwatch.
    """

    def __init__(self, silent=False):
        self.silent = silent
        self.cumulative_secs = {}
        self.current_ticks = {}

    def tick(self, name='default'):
        """Start the stopwatch `name`, or stop it if it is running.

        Returns 0.0 when starting, and the accumulated seconds for `name`
        when stopping.
        """
        started_at = self.current_ticks.pop(name, None)
        if started_at is None:
            self.current_ticks[name] = time.time()
            return 0.0

        elapsed = time.time() - started_at
        self.cumulative_secs[name] = self.cumulative_secs.get(name, 0) + elapsed
        return self.cumulative_secs[name]

    def tock(self, name='default'):
        """Stop `name`, log its total unless silent, reset it, return the total."""
        self.tick(name)
        total = self.cumulative_secs[name]
        if not self.silent:
            log.info('Time taken for {0}: {1:.8f}s'.format(name, total))
        del self.cumulative_secs[name]
        self.current_ticks.pop(name, None)
        return total
178 | 
179 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/kg_completion/wrangle_KG.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from os.path import join
  3 | import json
  4 | 
  5 | import argparse
  6 | import datetime
  7 | import json
  8 | import urllib
  9 | import pickle
 10 | import os
 11 | import numpy as np
 12 | import operator
 13 | import sys
 14 | 
# Seeded random generator. It is rebound with a different seed a few lines
# below, so this first instance is never used.
rdm = np.random.RandomState(234234)

# The dataset name is taken from the first command-line argument when one is
# given; otherwise a built-in default is used.
if len(sys.argv) > 1:
    dataset_name = sys.argv[1]
else:
    dataset_name = 'FB15k-237'
    #dataset_name = 'FB15k'
    #dataset_name = 'yago'
    #dataset_name = 'WN18RR'

print('Processing dataset {0}'.format(dataset_name))

rdm = np.random.RandomState(2342423)
# All input files are read from this directory, which is relative to the
# current working directory.
base_path = 'kg_completion/data/{0}/'.format(dataset_name)
files = ['train.txt', 'valid.txt', 'test.txt']

# Reads every line of every split into `data`, prepending each file's lines
# to those already collected. `data` is not referenced again in this script.
data = []
for p in files:
    with open(join(base_path, p)) as f:
        data = f.readlines() + data
 35 | 
 36 | 
# label_graph: (entity, relation) -> set of target entities, accumulated over
#              all three splits.
# train_graph: per split file, the same mapping built from that file only.
# test_cases:  per split file, the list of [e1, rel, e2] triples in file order.
label_graph = {}
train_graph = {}
test_cases = {}
for p in files:
    test_cases[p] = []
    train_graph[p] = {}


for p in files:
    with open(join(base_path, p)) as f:
        for i, line in enumerate(f):
            # Each line must hold exactly three tab-separated fields,
            # otherwise the unpacking below raises ValueError.
            e1, rel, e2 = line.split('\t')
            e1 = e1.strip()
            e2 = e2.strip()
            rel = rel.strip()
            # Every triple is also recorded in the inverse direction under a
            # synthetic "<rel>_reverse" relation.
            rel_reverse = rel+ '_reverse'

            # data
            # (Mike, fatherOf, John)
            # (John, fatherOf, Tom)

            if (e1 , rel) not in label_graph:
                label_graph[(e1, rel)] = set()

            if (e2,  rel_reverse) not in label_graph:
                label_graph[(e2, rel_reverse)] = set()

            if (e1,  rel) not in train_graph[p]:
                train_graph[p][(e1, rel)] = set()
            if (e2, rel_reverse) not in train_graph[p]:
                train_graph[p][(e2, rel_reverse)] = set()

            # labels
            # (Mike, fatherOf, John)
            # (John, fatherOf, Tom)
            # (John, fatherOf_reverse, Mike)
            # (Tom, fatherOf_reverse, John)
            label_graph[(e1, rel)].add(e2)

            label_graph[(e2, rel_reverse)].add(e1)

            # test cases
            # (Mike, fatherOf, John)
            # (John, fatherOf, Tom)
            test_cases[p].append([e1, rel, e2])

            # data
            # (Mike, fatherOf, John)
            # (John, fatherOf, Tom)
            # (John, fatherOf_reverse, Mike)
            # (Tom, fatherOf_reverse, John)
            train_graph[p][(e1, rel)].add(e2)
            train_graph[p][(e2, rel_reverse)].add(e1)
 90 | 
 91 | 
 92 | 
 93 | def write_training_graph(cases, graph, path):
 94 |     with open(path, 'w') as f:
 95 |         n = len(graph)
 96 |         for i, key in enumerate(graph):
 97 |             e1, rel = key
 98 |             # (Mike, fatherOf, John)
 99 |             # (John, fatherOf, Tom)
100 |             # (John, fatherOf_reverse, Mike)
101 |             # (Tom, fatherOf_reverse, John)
102 | 
103 |             # (John, fatherOf) -> Tom
104 |             # (John, fatherOf_reverse, Mike) 
105 |             entities1 = " ".join(list(graph[key]))
106 | 
107 |             data_point = {}
108 |             data_point['e1'] = e1
109 |             data_point['e2'] = 'None'
110 |             data_point['rel'] = rel
111 |             data_point['rel_eval'] = 'None'
112 |             data_point['e2_multi1'] =  entities1
113 |             data_point['e2_multi2'] = "None"
114 | 
115 |             f.write(json.dumps(data_point)  + '\n')
116 | 
def write_evaluation_graph(cases, graph, path):
    """Write one JSON object per triple in `cases`, one per line.

    For a triple (e1, rel, e2) the record carries the triple itself, the
    reverse relation name in `rel_eval`, all known targets of (e1, rel) in
    `e2_multi1` and all known targets of (e2, rel + '_reverse') in
    `e2_multi2`, both space-separated. Both keys must be present in `graph`.
    """
    with open(path, 'w') as f:
        for e1, rel, e2 in cases:
            rel_reverse = rel+'_reverse'
            forward_targets = " ".join(list(graph[(e1, rel)]))
            backward_targets = " ".join(list(graph[(e2, rel_reverse)]))

            data_point = {
                'e1': e1,
                'e2': e2,
                'rel': rel,
                'rel_eval': rel_reverse,
                'e2_multi1': forward_targets,
                'e2_multi2': backward_targets,
            }
            f.write(json.dumps(data_point) + '\n')
142 | 
143 | 
# Emit the four JSON-lines files for the dataset.
# - train:        (e1, rel) -> targets seen in train.txt only
# - ranking dev:  one record per valid.txt triple, labels from all splits
# - ranking test: one record per test.txt triple, labels from all splits
# - full:         (e1, rel) -> targets seen in any split
# The first argument of write_training_graph is not used by that function.
all_cases = test_cases['train.txt'] + test_cases['valid.txt'] + test_cases['test.txt']
write_training_graph(test_cases['train.txt'], train_graph['train.txt'], 'kg_completion/data/{0}/e1rel_to_e2_train.json'.format(dataset_name))
# `join` is called with a single argument here, so it returns the path unchanged.
write_evaluation_graph(test_cases['valid.txt'], label_graph, join('kg_completion/data/{0}/e1rel_to_e2_ranking_dev.json'.format(dataset_name)))
write_evaluation_graph(test_cases['test.txt'], label_graph, 'kg_completion/data/{0}/e1rel_to_e2_ranking_test.json'.format(dataset_name))
write_training_graph(all_cases, label_graph, 'kg_completion/data/{0}/e1rel_to_e2_full.json'.format(dataset_name))
149 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/math_word_problem_solving/config.yaml:
--------------------------------------------------------------------------------
 1 | graph_construction_name: "dependency"
 2 | graph_embedding_name: "graphsage"
 3 | decoder_name: "stdtree"
 4 | 
 5 | graph_construction_args:
 6 |   graph_construction_share:
 7 |     graph_type: 'dependency'
 8 |     root_dir: "./data"
 9 |     topology_subdir: 'DependencyGraph'
10 |     thread_number: 4
11 |     port: 9000
12 |     timeout: 15000
13 | 
14 |   graph_construction_private:
15 |     edge_strategy: 'homogeneous'
16 |     merge_strategy: 'tailhead'
17 |     sequential_link: true
18 |     as_node: false
19 | 
20 |   node_embedding:
21 |     input_size: 300
22 |     hidden_size: 300
23 |     word_dropout: 0.1
24 |     rnn_dropout: 0.1
25 |     fix_bert_emb: false
26 |     fix_word_emb: false
27 |     embedding_style:
28 |       single_token_item: true
29 |       emb_strategy: "w2v_bilstm"
30 |       num_rnn_layers: 1
31 |       bert_model_name: null
32 |       bert_lower_case: null
33 | 
34 |     sim_metric_type: 'weighted_cosine'
35 |     num_heads: 1
36 |     top_k_neigh: null
37 |     epsilon_neigh: 0.5
38 |     smoothness_ratio: 0.1
39 |     connectivity_ratio: 0.05
40 |     sparsity_ratio: 0.1
41 | 
42 | graph_embedding_args:
43 |   graph_embedding_share:
44 |     num_layers: 1
45 |     input_size: 300
46 |     hidden_size: 300
47 |     output_size: 300
48 |     direction_option: "undirected"
49 |     feat_drop: 0.0
50 |     attn_drop: 0.0
51 | 
52 |   graph_embedding_private:
53 |     aggregator_type: "lstm"
54 |     bias: true
55 |     norm: null
56 |     activation: "relu"
57 |     use_edge_weight: false
58 | 
59 | decoder_args:
60 |   rnn_decoder_share:
61 |     rnn_type: "lstm"
62 |     input_size: 300
63 |     hidden_size: 300
64 |     rnn_emb_input_size: 300
65 |     use_copy: true
66 |     graph_pooling_strategy: null
67 |     attention_type: "uniform"
68 |     fuse_strategy: "concatenate"
69 |     dropout: 0.3
70 |     teacher_forcing_rate: 1.0
71 | 
72 |   rnn_decoder_private:
73 |     max_decoder_step: 35
74 |     max_tree_depth: 8
75 |     use_sibling: false
76 |     use_input_feed: true
77 | 


--------------------------------------------------------------------------------
/IJCAI2021_demo/math_word_problem_solving/data/processed/DependencyGraph/data.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/math_word_problem_solving/data/processed/DependencyGraph/data.pt


--------------------------------------------------------------------------------
/IJCAI2021_demo/math_word_problem_solving/data/processed/DependencyGraph/vocab.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/math_word_problem_solving/data/processed/DependencyGraph/vocab.pt


--------------------------------------------------------------------------------
/IJCAI2021_demo/math_word_problem_solving/imgs/g2t.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/IJCAI2021_demo/math_word_problem_solving/imgs/g2t.png


--------------------------------------------------------------------------------
/IJCAI2021_demo/math_word_problem_solving/utils.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | import torch
 3 | import sympy
 4 | from random import randint
 5 | from sympy.parsing.sympy_parser import parse_expr
 6 | 
 7 | def convert_to_string(idx_list, form_manager):
 8 |     w_list = []
 9 |     for i in range(len(idx_list)):
10 |         w_list.append(form_manager.get_idx_symbol(int(idx_list[i])))
11 |     return " ".join(w_list)
12 | 
def is_all_same(c1, c2, form_manager):
    """Return True when two index sequences denote the same answer.

    Sequences of equal length whose elements match position by position are
    the same. Anything else is handed to `is_solution_same`, which decides
    by solving both equations.
    """
    lengths_agree = len(c1) == len(c2)
    if lengths_agree and not any(c1[pos] != c2[pos] for pos in range(len(c1))):
        return True
    return True if is_solution_same(c1, c2, form_manager) else False
27 | 
28 | 
def is_solution_same(i1, i2, form_manager):
    """Check whether two equations, given as symbol indices, solve to the same x.

    Both index sequences are decoded to text through
    `form_manager.get_idx_symbol`. Each text is split at '=' and the first
    two parts are parsed with sympy and solved for `x`; the first solutions
    of the two equations are then compared.

    Args:
        i1, i2: iterables of symbol indices.
        form_manager: object exposing `get_idx_symbol(idx)` and `unk_token`.

    Returns:
        True when both equations have at least one solution and their first
        solutions are equal. False when either text has no '=', contains the
        unknown token, has no solution, or cannot be parsed or solved.
    """
    c1 = " ".join([form_manager.get_idx_symbol(x) for x in i1])
    c2 = " ".join([form_manager.get_idx_symbol(x) for x in i2])
    if ('=' not in c1) or ('=' not in c2):
        return False
    if (form_manager.unk_token in c1) or (form_manager.unk_token in c2):
        return False

    try:
        s1 = c1.split('=')
        s2 = c2.split('=')
        x = sympy.Symbol('x')
        # NOTE: parse_expr evaluates the text as Python code (sympy documents
        # that it uses eval); only feed it model or dataset output, never
        # arbitrary user input.
        eq1 = [parse_expr(s1[0]), parse_expr(s1[1])]
        eq2 = [parse_expr(s2[0]), parse_expr(s2[1])]
        res1 = sympy.solve(sympy.Eq(eq1[0], eq1[1]), x)
        res2 = sympy.solve(sympy.Eq(eq2[0], eq2[1]), x)

        if not res1 or not res2:
            return False
        return res1[0] == res2[0]
    except Exception:
        # Malformed or unsolvable expressions count as "not the same".
        # Exception (not BaseException) is caught so that KeyboardInterrupt
        # and SystemExit still propagate.
        return False
61 | 
def compute_accuracy(candidate_list, reference_list, form_manager):
    """Fraction of candidates that match their reference.

    Candidates and references are compared pairwise with `is_all_same` over
    the first `min(len(candidate_list), len(reference_list))` positions. A
    length mismatch is reported on stdout but does not stop the comparison.

    Returns:
        The number of matches divided by the number of compared pairs, or
        0.0 when there is nothing to compare (this used to raise
        ZeroDivisionError).
    """
    if len(candidate_list) != len(reference_list):
        print("candidate list has length {}, reference list has length {}\n".format(
            len(candidate_list), len(reference_list)))
    len_min = min(len(candidate_list), len(reference_list))
    if len_min == 0:
        return 0.0
    c = sum(
        1 for i in range(len_min)
        if is_all_same(candidate_list[i], reference_list[i], form_manager)
    )
    return c/float(len_min)
74 | 
75 | 
def compute_tree_accuracy(candidate_list_, reference_list_, form_manager):
    """Copy both inputs into plain lists and score them with `compute_accuracy`."""
    candidates = [candidate_list_[k] for k in range(len(candidate_list_))]
    references = [reference_list_[k] for k in range(len(reference_list_))]
    return compute_accuracy(candidates, references, form_manager)
84 | 
def prepare_oov(batch_graph, src_vocab, device):
    """Build a batch-local vocabulary that also covers out-of-vocabulary tokens.

    A deep copy of `src_vocab` is extended with every node token that
    currently maps to the unknown-token index. The resulting ids, one per
    node in `batch_graph.node_attributes`, are stored as a long tensor on
    `device` under `batch_graph.node_features['token_id_oov']`.

    Returns:
        The extended vocabulary copy; `src_vocab` itself is left untouched.
    """
    extended_vocab = copy.deepcopy(src_vocab)
    token_ids = []
    for attributes in batch_graph.node_attributes:
        word = attributes['token']
        word_id = extended_vocab.get_symbol_idx(word)
        unk_id = extended_vocab.get_symbol_idx(extended_vocab.unk_token)
        if word_id == unk_id:
            extended_vocab.add_symbol(word)
        token_ids.append(extended_vocab.get_symbol_idx(word))
    id_tensor = torch.tensor(token_ids, dtype=torch.long)
    batch_graph.node_features['token_id_oov'] = id_tensor.to(device)
    return extended_vocab


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/kg_completion/__init__.py


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/evaluation.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import numpy as np
  3 | import datetime
  4 | 
  5 | from spodernet.utils.logger import Logger
  6 | from torch.autograd import Variable
  7 | from sklearn import metrics
  8 | 
  9 | log = Logger('evaluation{0}.py.txt'.format(datetime.datetime.now()))
 10 | 
def ranking_and_hits(model, dev_rank_batcher, vocab, name, kg_graph=None):
    """Evaluate link prediction with filtered ranks, Hits@1..10 and MRR.

    Every batch is scored in both directions: `(e1, rel)` to predict `e2`,
    and `(e2, rel_eval)` to predict `e1`. All known answers other than the
    target itself are zeroed out before ranking (the "filtered" setting).
    Per-direction and combined metrics are written to the module logger.

    Args:
        model: object whose `forward(entity, relation, kg_graph)` returns a
            2-D score tensor (presumably batch x num_entities -- confirm).
        dev_rank_batcher: iterable of dicts with the keys 'e1', 'e2', 'rel',
            'rel_eval', 'e2_multi1' and 'e2_multi2'; its `state.loss` is
            reset to `[0]` after every batch.
        vocab: not used by this function.
        name: label written to the log header.
        kg_graph: passed through to `model.forward` unchanged.

    Returns:
        The mean reciprocal rank over both directions.
    """
    log.info('')
    log.info('-'*50)
    log.info(name)
    log.info('-'*50)
    log.info('')
    hits_left = []
    hits_right = []
    hits = []
    ranks = []
    ranks_left = []
    ranks_right = []
    # One list per hits level; index k collects the Hits@(k+1) indicators.
    for i in range(10):
        hits_left.append([])
        hits_right.append([])
        hits.append([])

    for i, str2var in enumerate(dev_rank_batcher):
        e1 = str2var['e1']
        e2 = str2var['e2']
        rel = str2var['rel']
        rel_reverse = str2var['rel_eval']
        e2_multi1 = str2var['e2_multi1'].float()
        e2_multi2 = str2var['e2_multi2'].float()
        pred1 = model.forward(e1, rel, kg_graph)
        pred2 = model.forward(e2, rel_reverse, kg_graph)
        pred1, pred2 = pred1.data, pred2.data
        e1, e2 = e1.data, e2.data
        e2_multi1, e2_multi2 = e2_multi1.data, e2_multi2.data
        # NOTE: the inner loops reuse `i`, shadowing the batch index above.
        for i in range(e1.shape[0]):
            # these filters contain ALL labels
            filter1 = e2_multi1[i].long()
            filter2 = e2_multi2[i].long()

            # `num` is assigned but never read.
            num = e1[i, 0].item()
            # save the prediction that is relevant
            target_value1 = pred1[i,e2[i, 0].item()].item()
            target_value2 = pred2[i,e1[i, 0].item()].item()
            # zero all known cases (these are not interesting)
            # this corresponds to the filtered setting
            # NOTE(review): zeroing only pushes known answers down if scores
            # are non-negative (e.g. sigmoid outputs) -- confirm for the model.
            pred1[i][filter1] = 0.0
            pred2[i][filter2] = 0.0
            # write back the saved values
            pred1[i][e2[i]] = target_value1
            pred2[i][e1[i]] = target_value2


        # sort and rank
        max_values, argsort1 = torch.sort(pred1, 1, descending=True)
        max_values, argsort2 = torch.sort(pred2, 1, descending=True)

        argsort1 = argsort1.cpu().numpy()
        argsort2 = argsort2.cpu().numpy()
        for i in range(e1.shape[0]):
            # find the rank of the target entities
            rank1 = np.where(argsort1[i]==e2[i, 0].item())[0][0]
            rank2 = np.where(argsort2[i]==e1[i, 0].item())[0][0]
            # rank+1, since the lowest rank is rank 1 not rank 0
            ranks.append(rank1+1)
            ranks_left.append(rank1+1)
            ranks.append(rank2+1)
            ranks_right.append(rank2+1)

            # this could be done more elegantly, but here you go
            # rank1/rank2 are 0-based here, so `rank <= hits_level` means the
            # target is within the top (hits_level + 1) predictions.
            for hits_level in range(10):
                if rank1 <= hits_level:
                    hits[hits_level].append(1.0)
                    hits_left[hits_level].append(1.0)
                else:
                    hits[hits_level].append(0.0)
                    hits_left[hits_level].append(0.0)

                if rank2 <= hits_level:
                    hits[hits_level].append(1.0)
                    hits_right[hits_level].append(1.0)
                else:
                    hits[hits_level].append(0.0)
                    hits_right[hits_level].append(0.0)

        dev_rank_batcher.state.loss = [0]

    for i in range(10):
        log.info('Hits left @{0}: {1}'.format(i+1, np.mean(hits_left[i])))
        log.info('Hits right @{0}: {1}'.format(i+1, np.mean(hits_right[i])))
        log.info('Hits @{0}: {1}'.format(i+1, np.mean(hits[i])))
    log.info('Mean rank left: {0}', np.mean(ranks_left))
    log.info('Mean rank right: {0}', np.mean(ranks_right))
    log.info('Mean rank: {0}', np.mean(ranks))
    log.info('Mean reciprocal rank left: {0}', np.mean(1./np.array(ranks_left)))
    log.info('Mean reciprocal rank right: {0}', np.mean(1./np.array(ranks_right)))
    log.info('Mean reciprocal rank: {0}', np.mean(1./np.array(ranks)))

    return np.mean(1./np.array(ranks))


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/kinship/processed/KG_graph.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/kg_completion/kinship/processed/KG_graph.pt


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/kinship/raw/kinship.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/kg_completion/kinship/raw/kinship.tar.gz


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/preprocess.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Prepare the kinship dataset for the KG-completion demo.
# All paths are relative: run this from the directory containing kg_completion/.

# Stop at the first failing step so wrangle_KG.py never runs on missing data.
set -e

# -p makes the script safe to re-run when the directories already exist.
mkdir -p kg_completion/data
mkdir -p kg_completion/data/kinship
mkdir -p kg_completion/saved_models
tar -xvf kg_completion/kinship/raw/kinship.tar.gz -C kg_completion/data/kinship
python kg_completion/wrangle_KG.py kinship
7 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/saved_models/kinship_ggnn_distmult_0.2_0.25.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/kg_completion/saved_models/kinship_ggnn_distmult_0.2_0.25.model


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/kg_completion/spodernet/__init__.py


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/backends/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/kg_completion/spodernet/backends/__init__.py


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/backends/tfbackend.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | 
  3 | from spodernet.interfaces import IAtBatchPreparedObservable
  4 | from spodernet.utils.util import Timer
  5 | from spodernet.utils.global_config import Config
  6 | 
  7 | class TensorFlowConfig:
  8 |     inp = None
  9 |     support = None
 10 |     input_length = None
 11 |     support_length = None
 12 |     target = None
 13 |     index = None
 14 |     sess = None
 15 | 
 16 |     @staticmethod
 17 |     def init_batch_size(batch_size):
 18 |         TensorFlowConfig.inp = tf.placeholder(tf.int64, [batch_size, None])
 19 |         TensorFlowConfig.support = tf.placeholder(tf.int64, [batch_size, None])
 20 |         TensorFlowConfig.input_length = tf.placeholder(tf.int64, [batch_size,])
 21 |         TensorFlowConfig.support_length = tf.placeholder(tf.int64, [batch_size,])
 22 |         TensorFlowConfig.target = tf.placeholder(tf.int64, [batch_size])
 23 |         TensorFlowConfig.index = tf.placeholder(tf.int64, [batch_size])
 24 | 
 25 |     @staticmethod
 26 |     def get_session():
 27 |         if TensorFlowConfig.sess is None:
 28 |             TensorFlowConfig.sess = tf.Session()
 29 |         return TensorFlowConfig.sess
 30 | 
 31 | 
 32 | 
 33 | class TensorFlowConverter(IAtBatchPreparedObservable):
 34 | 
 35 |     def at_batch_prepared(self, batch_parts):
 36 |         inp, inp_len, sup, sup_len, t, idx = batch_parts
 37 |         if TensorFlowConfig.inp == None:
 38 |             log.error('You need to initialize the batch size via TensorflowConfig.init_batch_size(batchsize)!')
 39 |         feed_dict = {}
 40 |         feed_dict[TensorFlowConfig.inp] = inp
 41 |         feed_dict[TensorFlowConfig.support] = sup
 42 |         feed_dict[TensorFlowConfig.input_length] = inp_len
 43 |         feed_dict[TensorFlowConfig.support_length] = sup_len
 44 |         feed_dict[TensorFlowConfig.target] = t
 45 |         feed_dict[TensorFlowConfig.index] = idx
 46 | 
 47 |         str2var = {}
 48 |         str2var['input'] = TensorFlowConfig.inp
 49 |         str2var['input_length'] = TensorFlowConfig.input_length
 50 |         str2var['support'] = TensorFlowConfig.support
 51 |         str2var['support_length'] = TensorFlowConfig.support_length
 52 |         str2var['target'] = TensorFlowConfig.target
 53 |         str2var['index'] = TensorFlowConfig.index
 54 | 
 55 |         return str2var, feed_dict
 56 | 
 57 | def build_str2var_dict():
 58 |     str2var = {}
 59 |     if TensorFlowConfig.inp is not None:
 60 |         str2var['input'] = TensorFlowConfig.inp
 61 |     if TensorFlowConfig.support is not None:
 62 |         str2var['support'] = TensorFlowConfig.support
 63 |     if TensorFlowConfig.target is not None:
 64 |         str2var['target'] = TensorFlowConfig.target
 65 |     if TensorFlowConfig.input_length is not None:
 66 |         str2var['input_length'] = TensorFlowConfig.input_length
 67 |     if TensorFlowConfig.support_length is not None:
 68 |         str2var['support_length'] = TensorFlowConfig.support_length
 69 |     if TensorFlowConfig.index is not None:
 70 |         str2var['index'] = TensorFlowConfig.index
 71 |         return str2var
 72 | 
 73 | class TFTrainer(object):
 74 |     def __init__(self, model):
 75 |         self.sess = TensorFlowConfig.get_session()
 76 |         str2var = build_str2var_dict()
 77 |         self.logits, self.loss, self.argmax = model.forward(str2var)
 78 |         optimizer = tf.train.AdamOptimizer(0.001)
 79 | 
 80 |         if Config.L2 != 0.0:
 81 |             self.loss += tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()]) * Config.L2
 82 | 
 83 |         self.min_op = optimizer.minimize(self.loss)
 84 | 
 85 |         tf.global_variables_initializer().run(session=self.sess)
 86 | 
 87 |     def train_model(self, batcher, epochs=1, iterations=None):
 88 |         for epoch in range(epochs):
 89 |             for i, (str2var, feed_dict) in enumerate(batcher):
 90 |                 _, argmax_values = self.sess.run([self.min_op, self.argmax], feed_dict=feed_dict)
 91 | 
 92 |                 batcher.state.argmax = argmax_values
 93 |                 batcher.state.targets = feed_dict[TensorFlowConfig.target]
 94 | 
 95 |                 if iterations > 0:
 96 |                     if i == iterations: break
 97 | 
 98 |     def eval_model(self, batcher, iterations=None):
 99 |         for i, (str2var, feed_dict) in enumerate(batcher):
100 |             argmax_values = self.sess.run([self.argmax], feed_dict=feed_dict)[0]
101 | 
102 |             batcher.state.argmax = argmax_values
103 |             batcher.state.targets = feed_dict[TensorFlowConfig.target]
104 | 
105 |             if iterations > 0:
106 |                 if i == iterations: break
107 | 
108 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/backends/tfmodels.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | from tensorflow import placeholder
  3 | from spodernet.backends.tfbackend import TensorFlowConfig
  4 | from spodernet.utils.global_config import Config
  5 | from spodernet.frontend import AbstractModel
  6 | import numpy as np
  7 | 
  8 | def reader(inputs, lengths, output_size, contexts=(None, None), scope=None):
  9 |     with tf.variable_scope(scope or "reader") as varscope:
 10 | 
 11 |         cell = tf.contrib.rnn.LSTMCell(output_size, state_is_tuple=True,initializer=tf.contrib.layers.xavier_initializer())
 12 | 
 13 |         cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=1.0-Config.dropout)
 14 | 
 15 |         outputs, states = tf.nn.bidirectional_dynamic_rnn(
 16 |             cell,
 17 |             cell,
 18 |             inputs,
 19 |             sequence_length=lengths,
 20 |             initial_state_fw=contexts[0],
 21 |             initial_state_bw=contexts[1],
 22 |             dtype=tf.float32)
 23 | 
 24 |         return outputs, states
 25 | 
 26 | def predictor(inputs, targets, target_size):
 27 |     init = tf.contrib.layers.xavier_initializer(uniform=True) #uniform=False for truncated normal
 28 |     logits = tf.contrib.layers.fully_connected(inputs, target_size, weights_initializer=init, activation_fn=None)
 29 | 
 30 |     loss = tf.reduce_mean(
 31 |         tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
 32 |             labels=targets), name='predictor_loss')
 33 |     predict = tf.arg_max(tf.nn.softmax(logits), 1, name='prediction')
 34 |     return [logits, loss, predict]
 35 | 
 36 | 
 37 | class TFEmbedding(AbstractModel):
 38 | 
 39 |     def __init__(self, embedding_size, num_embeddings, scope=None):
 40 |         super(TFEmbedding, self).__init__()
 41 | 
 42 |         self.embedding_size = embedding_size
 43 |         self.scope = scope
 44 |         self.num_embeddings = num_embeddings
 45 | 
 46 |     def forward(self, str2var, *args):
 47 |         self.expected_str2var_keys(str2var, ['input', 'support'])
 48 |         self.expected_args('None', 'None')
 49 |         self.generated_outputs('input idx, support idx', 'both sequences have shape = [batch, timesteps, embedding dim]')
 50 | 
 51 |         embeddings = tf.get_variable("embeddings", [self.num_embeddings, self.embedding_size],
 52 |                                 initializer=tf.random_normal_initializer(0., 1./np.sqrt(self.embedding_size)),
 53 |                                 trainable=True, dtype="float32")
 54 | 
 55 |         with tf.variable_scope("embedders") as varscope:
 56 |             seqQ = tf.nn.embedding_lookup(embeddings, TensorFlowConfig.inp)
 57 |             varscope.reuse_variables()
 58 |             seqS = tf.nn.embedding_lookup(embeddings, TensorFlowConfig.support)
 59 | 
 60 |         return seqQ, seqS
 61 | 
 62 | class TFPairedBiDirectionalLSTM(AbstractModel):
 63 | 
 64 |     def __init__(self, hidden_size, scope=None, conditional_encoding=True):
 65 |         super(TFPairedBiDirectionalLSTM, self).__init__()
 66 |         self.hidden_size = hidden_size
 67 |         self.scope = scope
 68 |         if not conditional_encoding:
 69 |             raise NotImplementedError("conditional_encoding=False is not implemented yet.")
 70 | 
 71 |     def forward(self, str2var, *args):
 72 |         self.expected_str2var_keys(str2var, ['input_length', 'support_length'])
 73 |         self.expected_args('seq input, seq support', 'dimension of both: [batch, timesteps, embedding dim]')
 74 |         self.generated_outputs('stacked outputs of last timestep', 'dim is [batch_size, 2x hidden size]')
 75 | 
 76 |         seqQ, seqS = args
 77 | 
 78 |         with tf.variable_scope(self.scope or "conditional_reader_seq1") as varscope1:
 79 |             #seq1_states: (c_fw, h_fw), (c_bw, h_bw)
 80 |             _, seq1_states = reader(seqQ, TensorFlowConfig.input_length, self.hidden_size, scope=varscope1)
 81 |         with tf.variable_scope(self.scope or "conditional_reader_seq2") as varscope2:
 82 |             varscope1.reuse_variables()
 83 |             # each [batch_size x max_seq_length x output_size]
 84 |             outputs, states = reader(seqS, TensorFlowConfig.support_length, self.hidden_size, seq1_states, scope=varscope2)
 85 | 
 86 |         output = tf.concat([states[0][1], states[1][1]], 1)
 87 | 
 88 |         return [output]
 89 | 
 90 | class TFSoftmaxCrossEntropy(AbstractModel):
 91 | 
 92 |     def __init__(self, num_labels):
 93 |         super(TFSoftmaxCrossEntropy, self).__init__()
 94 |         self.num_labels = num_labels
 95 | 
 96 |     def forward(self, str2var, *args):
 97 |         self.expected_str2var_keys(str2var, ['target'])
 98 |         self.expected_args('some inputs', 'dimension: [batch, any]')
 99 |         self.generated_outputs('logits, loss, argmax', 'dimensions: logits = [batch, labels], loss = 1x1, argmax = [batch, 1]')
100 |         outputs_prev_layer = args[0]
101 | 
102 |         logits, loss, argmax = predictor(outputs_prev_layer, TensorFlowConfig.target, self.num_labels)
103 | 
104 |         return [logits, loss, argmax]
105 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/backends/torchbackend.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from torch.autograd import Variable
  3 | from itertools import chain
  4 | 
  5 | import torch
  6 | import numpy as np
  7 | 
  8 | from spodernet.interfaces import IAtBatchPreparedObservable
  9 | from spodernet.utils.util import Timer
 10 | from spodernet.utils.global_config import Config
 11 | 
 12 | class TorchConverter(IAtBatchPreparedObservable):
 13 |     def __init__(self, is_volatile):
 14 |         self.is_volatile = is_volatile
 15 | 
 16 |     def at_batch_prepared(self, str2var):
 17 |         for key in str2var.keys():
 18 |             if 'length' in key: continue
 19 |             if str2var[key].dtype == np.int32:
 20 |                 str2var[key] = np.int64(str2var[key])
 21 |             str2var[key] = Variable(torch.from_numpy(str2var[key]), volatile=self.is_volatile)
 22 |         return str2var
 23 | 
 24 | class TorchCUDAConverter(IAtBatchPreparedObservable):
 25 |     def __init__(self, device_id):
 26 |         self.device_id = device_id
 27 | 
 28 |     def at_batch_prepared(self, str2var):
 29 |         for key in str2var.keys():
 30 |             if 'length' in key: continue
 31 |             str2var[key] = str2var[key].cuda(self.device_id, True)
 32 |         return str2var
 33 | 
 34 | 
 35 | class TorchNegativeSampling(IAtBatchPreparedObservable):
 36 |     def __init__(self, max_index, keys_to_corrupt=['input', 'target']):
 37 |         self.max_index = max_index
 38 |         self.keys_to_corrupt = keys_to_corrupt
 39 |         self.rdm = np.random.RandomState(34534)
 40 | 
 41 |     def at_batch_prepared(self, str2var):
 42 |         samples_per_key = Config.batch_size/len(self.keys_to_corrupt)
 43 |         for i, key in enumerate(self.keys_to_corrupt):
 44 |             variable = str2var[key]
 45 |             new_idx = self.rdm.choice(self.max_index, samples_per_key)
 46 |             if Config.cuda:
 47 |                 variable_corrupted = Variable(torch.cuda.LongTensor(variable.size()))
 48 |                 variable_corrupted.data.copy_(variable.data)
 49 |                 variable_corrupted.data[i*samples_per_key: (i+1)*samples_per_key] = torch.from_numpy(new_idx).cuda()
 50 |             else:
 51 |                 variable_corrupted = Variable(torch.LongTensor(variable.size()))
 52 |                 variable_corrupted.data.copy_(variable.data)
 53 |                 variable_corrupted.data[i*samples_per_key: (i+1)*samples_per_key] = torch.from_numpy(new_idx)
 54 |             str2var[key + '_corrupt'] = variable_corrupted
 55 | 
 56 |         return str2var
 57 | 
 58 | 
 59 | ######################################
 60 | #
 61 | #          Util functions
 62 | #
 63 | ######################################
 64 | 
 65 | 
 66 | def get_list_of_torch_modules(model):
 67 |     modules = []
 68 |     for module in model.modules:
 69 |         if hasattr(module, 'modules'):
 70 |             for module2 in module.modules:
 71 |                 modules.append(module2)
 72 |         else:
 73 |             modules.append(module)
 74 |     return modules
 75 | 
 76 | 
 77 | 
 78 | def train_model(model, batcher, epochs=1, iterations=None):
 79 |     modules = get_list_of_torch_modules(model)
 80 |     generators = []
 81 |     for module in modules:
 82 |         if Config.cuda:
 83 |             module.cuda()
 84 |         generators.append(module.parameters())
 85 | 
 86 |     parameters = chain.from_iterable(generators)
 87 |     optimizer = torch.optim.Adam(parameters, lr=0.001)
 88 |     for module in modules:
 89 |         module.train()
 90 | 
 91 |     for epoch in range(epochs):
 92 |         for i, str2var in enumerate(batcher):
 93 |             optimizer.zero_grad()
 94 |             logits, loss, argmax = model.forward(str2var)
 95 |             loss.backward()
 96 |             optimizer.step()
 97 |             batcher.state.argmax = argmax
 98 |             batcher.state.targets = str2var['target']
 99 | 
100 |             if iterations > 0:
101 |                 if i == iterations: break
102 | 
103 | 
def eval_model(model, batcher, iterations=None):
    """Run ``model.forward`` over the batcher with all modules in eval mode.

    Args:
        model: object with ``forward(str2var)`` returning
            ``(logits, loss, argmax)``.
        batcher: iterable of ``str2var`` dicts (each with a 'target' entry)
            exposing a ``state`` object that receives ``argmax``/``targets``.
        iterations: optional cap; the loop stops after the batch whose index
            equals this value. None or a non-positive value means no cap.
    """
    modules = get_list_of_torch_modules(model)
    for module in modules:
        module.eval()

    # `iterations` defaults to None; comparing None > 0 raises on Python 3.
    limited = iterations is not None and iterations > 0

    for i, str2var in enumerate(batcher):
        logits, loss, argmax = model.forward(str2var)
        batcher.state.argmax = argmax
        batcher.state.targets = str2var['target']

        if limited and i == iterations:
            break
116 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/backends/torchmodels.py:
--------------------------------------------------------------------------------
  1 | from torch.nn import LSTM
  2 | from torch.autograd import Variable
  3 | 
  4 | import torch
  5 | import torch.nn.functional as F
  6 | 
  7 | from spodernet.frontend import AbstractModel
  8 | from spodernet.utils.global_config import Config
  9 | 
 10 | class TorchEmbedding(torch.nn.Module, AbstractModel):
 11 |     def __init__(self, embedding_size, num_embeddings):
 12 |         super(TorchEmbedding, self).__init__()
 13 | 
 14 |         self.emb= torch.nn.Embedding(num_embeddings,
 15 |                 embedding_size, padding_idx=0)#, scale_grad_by_freq=True, padding_idx=0)
 16 | 
 17 |     def forward(self, str2var, *args):
 18 |         self.expected_str2var_keys_oneof(str2var, ['input', 'support'])
 19 |         self.expected_args('None', 'None')
 20 |         self.generated_outputs('input idx, support idx', 'both sequences have shape = [batch, timesteps, embedding dim]')
 21 | 
 22 |         embedded_results = []
 23 |         if 'input' in str2var:
 24 |             embedded_results.append(self.emb(str2var['input']))
 25 | 
 26 |         if 'support' in str2var:
 27 |             embedded_results.append(self.emb(str2var['support']))
 28 | 
 29 |         return embedded_results
 30 | 
 31 | class TorchBiDirectionalLSTM(torch.nn.Module, AbstractModel):
 32 |     def __init__(self, input_size, hidden_size,
 33 |             dropout=0.0, layers=1,
 34 |             bidirectional=True, to_cuda=False, conditional_encoding=True):
 35 |         super(TorchBiDirectionalLSTM, self).__init__()
 36 | 
 37 |         use_bias = True
 38 |         num_directions = (1 if not bidirectional else 2)
 39 | 
 40 |         self.lstm = LSTM(input_size,hidden_size,layers,
 41 |                          use_bias,True,0.2,bidirectional)
 42 | 
 43 |         # states of both LSTMs
 44 |         self.h0 = None
 45 |         self.c0 = None
 46 | 
 47 |         self.h0 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 48 |         self.c0 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 49 | 
 50 |         if Config.cuda:
 51 |             self.h0 = self.h0.cuda()
 52 |             self.c0 = self.c0.cuda()
 53 | 
 54 |     def forward(self, str2var, *args):
 55 |         self.expected_str2var_keys(str2var, [])
 56 |         self.expected_args('embedded seq', 'size [batch, time steps, embedding dim]')
 57 |         self.generated_outputs('LSTM output seq', 'size [batch, time steps, 2x hidden dim]')
 58 |         seq = args
 59 |         self.h0.data.zero_()
 60 |         self.c0.data.zero_()
 61 |         out, hid = self.lstm(seq, (self.h0, self.c0))
 62 |         return [out, hid]
 63 | 
 64 | 
 65 | class TorchPairedBiDirectionalLSTM(torch.nn.Module, AbstractModel):
 66 |     def __init__(self, input_size, hidden_size,
 67 |             dropout=0.0, layers=1,
 68 |             bidirectional=True, to_cuda=False, conditional_encoding=True):
 69 |         super(TorchPairedBiDirectionalLSTM, self).__init__()
 70 | 
 71 |         self.conditional_encoding = conditional_encoding
 72 |         use_bias = True
 73 |         num_directions = (1 if not bidirectional else 2)
 74 | 
 75 |         self.conditional_encoding = conditional_encoding
 76 |         self.lstm1 = LSTM(input_size,hidden_size,layers,
 77 |                          use_bias,True,Config.dropout,bidirectional)
 78 |         self.lstm2 = LSTM(input_size,hidden_size,layers,
 79 |                          use_bias,True,Config.dropout,bidirectional)
 80 | 
 81 |         # states of both LSTMs
 82 |         self.h01 = None
 83 |         self.c01 = None
 84 |         self.h02 = None
 85 |         self.c02 = None
 86 | 
 87 | 
 88 |         self.h01 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 89 |         self.c01 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 90 | 
 91 |         if Config.cuda:
 92 |             self.h01 = self.h01.cuda()
 93 |             self.c01 = self.c01.cuda()
 94 | 
 95 |         if not self.conditional_encoding:
 96 |             self.h02 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 97 |             self.c02 = Variable(torch.FloatTensor(num_directions*layers, Config.batch_size, hidden_size))
 98 | 
 99 |             if Config.cuda:
100 |                 self.h02 = self.h02.cuda()
101 |                 self.c02 = self.c02.cuda()
102 | 
103 | 
104 |     def forward(self, str2var, *args):
105 |         self.expected_str2var_keys(str2var, [])
106 |         self.expected_args('embedded input seq, embedded seq support', 'both of size [batch, time steps, embedding dim]')
107 |         self.generated_outputs('LSTM output seq inputs, LSTM output seq support', 'both of size [batch, time steps, 2x hidden dim]')
108 |         seq1, seq2 = args
109 |         if self.conditional_encoding:
110 |             self.h01.data.zero_()
111 |             self.c01.data.zero_()
112 |             out1, hid1 = self.lstm1(seq1, (self.h01, self.c01))
113 |             out2, hid2 = self.lstm2(seq2, hid1)
114 |         else:
115 |             self.h01.data.zero_()
116 |             self.c01.data.zero_()
117 |             self.h02.data.zero_()
118 |             self.c02.data.zero_()
119 |             out1, hid1 = self.lstm1(seq1, (self.h01, self.c01))
120 |             out2, hid2 = self.lstm2(seq2, (self.h02, self.c02))
121 |         return [out1, out2]
122 | 
class TorchVariableLengthOutputSelection(torch.nn.Module, AbstractModel):
    """Picks, per example, the LSTM output at the last valid time step of both sequences."""

    def __init__(self):
        super(TorchVariableLengthOutputSelection, self).__init__()
        # Kept for backward compatibility; masks are built fresh on every call.
        self.b1 = None
        self.b2 = None

    @staticmethod
    def _select_last(output, lengths):
        """Return ``output[i, lengths[i]-1, :]`` for every row i, reshaped to [Config.batch_size, -1]."""
        mask = torch.ByteTensor(output.size())
        if Config.cuda:
            mask = mask.cuda()
        mask.fill_(0)
        for i, num in enumerate(lengths.data):
            mask[i, num-1, :] = 1
        return output[mask].view(Config.batch_size, -1)

    def forward(self, str2var, *args):
        """Select and concatenate the last-step outputs of both sequences.

        Args:
            str2var: dict providing 'input_length' and 'support_length'.
            *args: exactly two values, the output sequences of both LSTMs.

        Returns:
            One-element list with the two selections concatenated along axis 1.
        """
        self.expected_str2var_keys(str2var, ['input_length', 'support_length'])
        self.expected_args('LSTM output sequence input , LSTM output sequence support', 'dimension of both: [batch, time steps, 2x LSTM hidden size]')
        self.generated_outputs('stacked bidirectional outputs of last timestep', 'dim is [batch_size, 4x hidden size]')

        output_lstm1, output_lstm2 = args
        l1, l2 = str2var['input_length'], str2var['support_length']

        # The former `if self.b1 == None` guard never stored the masks and
        # would have left b1/b2 unbound if it were ever false; the masks are
        # now always created locally.
        out1 = self._select_last(output_lstm1, l1)
        out2 = self._select_last(output_lstm2, l2)

        out = torch.cat([out1, out2], 1)
        return [out]
156 | 
class TorchSoftmaxCrossEntropy(torch.nn.Module, AbstractModel):
    """Linear projection to label space followed by log-softmax / NLL loss."""

    def __init__(self, input_dim, num_labels):
        """Create the ``input_dim -> num_labels`` projection layer."""
        super(TorchSoftmaxCrossEntropy, self).__init__()
        self.num_labels = num_labels
        self.projection_to_labels = torch.nn.Linear(input_dim, num_labels)

    def forward(self, str2var, *args):
        """Compute logits, loss and the top-1 prediction.

        Args:
            str2var: dict providing the 'target' entry used as NLL target.
            *args: the first element is the previous layer's output.

        Returns:
            list ``[logits, loss, argmax]`` where ``argmax`` holds the indices
            from ``torch.topk(..., 1)`` on the log-probabilities.
        """
        self.expected_str2var_keys(str2var, ['target'])
        self.expected_args('some inputs', 'dimension: [batch, any]')
        self.generated_outputs('logits, loss, argmax', 'dimensions: logits = [batch, labels], loss = 1x1, argmax = [batch, 1]')

        outputs_prev_layer = args[0]
        t = str2var['target']

        logits = self.projection_to_labels(outputs_prev_layer)
        # Explicit class dimension: the implicit choice is deprecated. For the
        # documented [batch, labels] logits it was already dim 1.
        out = F.log_softmax(logits, dim=1)
        loss = F.nll_loss(out, t)
        maximum, argmax = torch.topk(out.data, 1)

        return [logits, loss, argmax]
178 | 
179 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/kg_completion/spodernet/data/__init__.py


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/data/snli2spoder.py:
--------------------------------------------------------------------------------
'''Downloads SNLI data and wrangles it into the spoder format'''


# NOTE(review): `snli2spoder` is neither defined nor imported in this module
# as shown, so running the file as a script would raise NameError — confirm
# whether the implementation was removed or lives elsewhere.
if __name__ == '__main__':
    snli2spoder()
6 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/frontend.py:
--------------------------------------------------------------------------------
  1 | from itertools import chain
  2 | 
  3 | from spodernet.utils.global_config import Config, Backends
  4 | 
  5 | from spodernet.utils.logger import Logger
  6 | log = Logger('frontend.py.txt')
  7 | 
  8 | 
  9 | class Model(object):
 10 | 
 11 |     def __init__(self, input_module=None):
 12 |         self.modules = []
 13 |         self.input_module = input_module
 14 |         self.module = self
 15 | 
 16 |     def add(self, module):
 17 |         self.modules.append(module)
 18 | 
 19 |     def forward(self, str2var, *inputs):
 20 |         outputs = inputs
 21 |         if inputs == None:
 22 |             outputs = []
 23 |         for module in self.modules:
 24 |             outputs = module.forward(str2var, *outputs)
 25 |         return outputs
 26 | 
 27 | class Trainer(object):
 28 |     def __init__(self, model):
 29 |         self.model = model
 30 | 
 31 |         self.trainer_backend = None
 32 |         self.train_func = None
 33 |         self.eval_func = None
 34 |         if Config.backend == Backends.TENSORFLOW:
 35 |             from spodernet.backends.tfbackend import TFTrainer
 36 |             self.trainer_backend = TFTrainer(model)
 37 |             self.train_func = lambda _, batch, epochs, iterations: self.trainer_backend.train_model(batch, epochs, iterations)
 38 |             self.eval_func = lambda _, batch, iterations: self.trainer_backend.eval_model(batch, iterations)
 39 |         elif Config.backend == Backends.TORCH:
 40 |             from spodernet.backends.torchbackend import train_model, eval_model
 41 |             self.train_func = train_model
 42 |             self.eval_func = eval_model
 43 | 
 44 |     def train(self, batcher, epochs=1, iterations=None):
 45 |         self.train_func(self.model, batcher, epochs, iterations)
 46 | 
 47 |     def evaluate(self, batcher, iterations=None):
 48 |         self.eval_func(self.model, batcher, iterations)
 49 | 
 50 | class AbstractModel(object):
 51 | 
 52 |     def __init__(self):
 53 |         super(AbstractModel, self).__init__()
 54 |         self.input_str_args = None
 55 |         self.output_str_args = None
 56 |         self.used_keys = None
 57 | 
 58 |     def forward(self, str2var, *args):
 59 |         raise NotImplementedError("Classes that inherit from AbstractModel need to implement the forward method.")
 60 | 
 61 |     @property
 62 |     def modules(self):
 63 |         raise NotImplementedError("Classes that inherit from AbstractModel need to overrite the modules property.")
 64 | 
 65 |     def expected_str2var_keys(self, str2var, keys):
 66 |         self.used_keys = keys
 67 |         for key in keys:
 68 |             if key not in str2var:
 69 |                 log.error('Variable with name {0} expected, but not found in str2variable dict with keys {1}'.format(key, str2var.keys()))
 70 | 
 71 |     def expected_str2var_keys_oneof(self, str2var, keys):
 72 |         self.used_keys = keys
 73 |         one_exists = False
 74 |         for key in keys:
 75 |             if key in str2var:
 76 |                 one_exists = True
 77 |         if not one_exists:
 78 |             log.error('At least one of these variable was expected: {0}. But str2var only has these variables: {1}.', keys, str2var.keys())
 79 | 
 80 |     def expected_args(self, str_arg_names, str_arg_description):
 81 |         log.debug_once('Expected args {0}'.format(str_arg_names))
 82 |         log.debug_once('Info for the expected arguments: {0}'.format(str_arg_description))
 83 |         self.input_str_args = str_arg_names
 84 | 
 85 |     def generated_outputs(self, str_output_names, str_output_description):
 86 |         log.debug_once('Generated outputs: {0}'.format(str_output_names))
 87 |         log.debug_once('Info for the provided outputs: {0}'.format(str_output_description))
 88 |         self.output_str_args = str_output_names
 89 |         self.used_keys
 90 |         self.input_str_args
 91 |         self.output_str_args
 92 |         message = '{0} + {1} -> {2}'.format(self.used_keys, self.input_str_args, self.output_str_args)
 93 |         log.info_once(message)
 94 | 
 95 | 
 96 | class Embedding(object):
 97 |     def __init__(self, embedding_size, num_embeddings, scope=None):
 98 |         self.embedding_size = embedding_size
 99 |         self.scope = scope
100 |         self.num_embeddings = num_embeddings
101 | 
102 |         self.module = None
103 |         if Config.backend == Backends.TENSORFLOW:
104 |             from spodernet.backends.tfmodels import TFEmbedding
105 |             self.module = TFEmbedding(embedding_size, num_embeddings, scope)
106 |         elif Config.backend == Backends.TORCH:
107 |             from spodernet.backends.torchmodels import TorchEmbedding
108 |             self.module = TorchEmbedding(embedding_size, num_embeddings)
109 |             self.modules = [self.module]
110 | 
111 |     def forward(self, str2var, *args):
112 |         return self.module.forward(str2var, *args)
113 | 
114 | 
class PairedBiDirectionalLSTM(object):
    """Backend-independent paired bidirectional LSTM encoder."""

    def __init__(self, input_size, hidden_size, scope=None, conditional_encoding=True):
        """Instantiate the implementation matching ``Config.backend``.

        For Torch a two-stage ``Model`` is built (paired LSTM followed by
        last-step output selection) and its module list is exposed as
        ``self.modules``. For other backends ``self.module`` stays None.
        """
        super(PairedBiDirectionalLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.scope = scope
        self.module = None

        backend = Config.backend
        if backend == Backends.TENSORFLOW:
            from spodernet.backends.tfmodels import TFPairedBiDirectionalLSTM
            self.module = TFPairedBiDirectionalLSTM(hidden_size, scope, conditional_encoding)
        elif backend == Backends.TORCH:
            from spodernet.backends.torchmodels import TorchPairedBiDirectionalLSTM, TorchVariableLengthOutputSelection
            pipeline = Model()
            pipeline.add(TorchPairedBiDirectionalLSTM(input_size, hidden_size, conditional_encoding=conditional_encoding))
            pipeline.add(TorchVariableLengthOutputSelection())

            self.module = pipeline
            self.modules = pipeline.modules

    def forward(self, str2var, *args):
        """Delegate to the wrapped backend module."""
        wrapped = self.module
        return wrapped.forward(str2var, *args)
137 | 
138 | 
class SoftmaxCrossEntropy(object):
    """Backend-independent softmax cross-entropy classification head."""

    def __init__(self, input_size, num_labels):
        """Instantiate the implementation matching ``Config.backend``.

        ``self.module`` stays None for other backends; ``self.modules`` is
        only set for the Torch backend.
        """
        super(SoftmaxCrossEntropy, self).__init__()
        self.num_labels = num_labels
        self.module = None

        backend = Config.backend
        if backend == Backends.TENSORFLOW:
            from spodernet.backends.tfmodels import TFSoftmaxCrossEntropy
            self.module = TFSoftmaxCrossEntropy(num_labels)
        elif backend == Backends.TORCH:
            from spodernet.backends.torchmodels import TorchSoftmaxCrossEntropy
            torch_module = TorchSoftmaxCrossEntropy(input_size, num_labels)
            self.module = torch_module
            self.modules = [torch_module]

    def forward(self, str2var, *args):
        """Delegate to the wrapped backend module."""
        wrapped = self.module
        return wrapped.forward(str2var, *args)
155 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/hooks.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.stats
  3 | import datetime
  4 | 
  5 | from spodernet.interfaces import IAtIterEndObservable, IAtEpochEndObservable, IAtEpochStartObservable
  6 | from spodernet.utils.util import Timer
  7 | from spodernet.utils.global_config import Config, Backends
  8 | 
  9 | from spodernet.utils.logger import Logger
 10 | log = Logger('hooks.py.txt')
 11 | 
 12 | class AbstractHook(IAtIterEndObservable, IAtEpochEndObservable):
 13 |     def __init__(self, name, metric_name, print_every_x_batches):
 14 |         self.epoch_errors = []
 15 |         self.current_scores = []
 16 |         self.name = name
 17 |         self.iter_count = 0
 18 |         self.print_every = print_every_x_batches
 19 |         self.metric_name = metric_name
 20 |         self.epoch = 1
 21 | 
 22 |         # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
 23 |         self.n = 0
 24 |         self.epoch_n = 0
 25 |         self.mean = 0
 26 |         self.M2 = 0
 27 |         self.load_backend_specific_functions()
 28 | 
 29 |     def load_backend_specific_functions(self):
 30 |         if Config.backend == Backends.TORCH:
 31 |             from torch.autograd import Variable
 32 |             def convert_state(state):
 33 |                 if isinstance(state.targets, Variable):
 34 |                     state.targets = state.targets.data
 35 |                 if isinstance(state.argmax, Variable):
 36 |                     state.argmax = state.argmax.data
 37 |                 if isinstance(state.pred, Variable):
 38 |                     state.pred = state.pred.data
 39 |                 if isinstance(state.loss, Variable):
 40 |                     state.loss = state.loss.data
 41 |                 if isinstance(state.multi_labels, Variable):
 42 |                     state.multi_labels = state.multi_labels.data
 43 | 
 44 |                 return state
 45 | 
 46 |             self.convert_state = convert_state
 47 |         else:
 48 |             self.convert_state = lambda x: x
 49 | 
 50 |     def calculate_metric(self, state):
 51 |         raise NotImplementedError('Classes that inherit from abstract hook need to implement the calcualte metric method.')
 52 | 
 53 |     def at_end_of_iter_event(self, state):
 54 |         state = self.convert_state(state)
 55 |         metric = self.calculate_metric(state)
 56 |         #print(metric)
 57 | 
 58 |         self.n += 1
 59 |         delta = metric - self.mean
 60 |         self.mean += delta/self.n
 61 |         delta2 = metric - self.mean
 62 |         self.M2 += delta*delta2
 63 | 
 64 |         self.current_scores.append(metric)
 65 |         self.iter_count += 1
 66 |         if self.iter_count % self.print_every == 0:
 67 |             lower, upper, m, n = self.print_statistic()
 68 |             self.n = 0
 69 |             self.mean = 0
 70 |             self.M2 = 0
 71 |             return lower, upper, m, n
 72 |         return 0, 0, self.mean, self.n
 73 | 
 74 |     def at_end_of_epoch_event(self, state):
 75 |         if self.n == 0: return 0, 0, 0, 0
 76 |         self.epoch_errors.append(self.get_confidence_intervals())
 77 |         lower, upper, m, n = self.print_statistic(True)
 78 |         del self.current_scores[:]
 79 |         self.n = 0
 80 |         self.mean = 0
 81 |         self.M2 = 0
 82 |         self.epoch += 1
 83 |         self.iter_count = 0
 84 |         return lower, upper, m, n
 85 | 
 86 |     def get_confidence_intervals(self, percentile=0.99, limit=1000):
 87 |         z = scipy.stats.norm.ppf(percentile)
 88 |         var = self.M2/ (self.n)
 89 |         SE = np.sqrt(var/self.n)
 90 |         lower = self.mean-(z*SE)
 91 |         upper = self.mean+(z*SE)
 92 |         return [self.n, lower, self.mean, upper]
 93 | 
 94 |     def print_statistic(self, at_epoch_end=False):
 95 |         n, lower, m, upper = self.get_confidence_intervals()
 96 |         str_message = '{3} {4}: {2:.5}\t99% CI: ({0:.5}, {1:.5}), n={5}'.format(lower, upper, m, self.name, self.metric_name, self.n)
 97 |         if at_epoch_end: log.info('\n')
 98 |         if at_epoch_end: log.info('#'*40)
 99 |         if at_epoch_end: log.info(' '*10 + 'COMPLETED EPOCH: {0}'.format(self.epoch) + ' '*30)
100 |         log.info(str_message)
101 |         if at_epoch_end: log.info('#'*40)
102 |         if at_epoch_end: log.info('\n')
103 |         return lower, upper, m, n
104 | 
105 | 
class AccuracyHook(AbstractHook):
    """Hook reporting the fraction of targets matched by the predictions."""

    def __init__(self, name='', print_every_x_batches=1000):
        super(AccuracyHook, self).__init__(name, 'Accuracy', print_every_x_batches)
        self.func = None
        self.topk = 1
        if Config.backend == Backends.TORCH:
            import torch
            self.func = torch.sum

    def calculate_metric(self, state):
        """Return (number of matches) / (number of rows in state.argmax).

        For the torch backend a 2-D state.argmax is compared column by column
        against state.targets and the matches of all columns are summed.

        Raises:
            Exception: If Config.backend is not a supported backend.
        """
        backend = Config.backend
        if backend == Backends.TORCH:
            predicted = state.argmax
            hits = 0.0
            if len(predicted.size()) == 1:
                hits += self.func(state.targets==predicted)
            else:
                for column in range(predicted.size(1)):
                    hits += self.func(state.targets==predicted[:, column])
            return hits.item()/np.float32(predicted.size()[0])

        if backend in (Backends.TENSORFLOW, Backends.TEST):
            # Both backends use the same array-based computation.
            rows = state.argmax.shape[0]
            return np.sum(state.targets==state.argmax)/np.float32(rows)

        raise Exception('Backend has unsupported value {0}'.format(Config.backend))
134 | 
135 | 
class TopKRankingLoss(AbstractHook):
    """Hook reporting Hits@k: how often the target is among the top-k scores.

    Args:
        k: Number of top-scoring candidates to consider.
        filtered: When true, entries flagged in state.multi_labels (other than
            the target columns) are masked out before ranking.
        name: Hook name used in log output.
        print_every_x_batches: Print interval in iterations.
    """

    def __init__(self, k, filtered=False, name='', print_every_x_batches=1000):
        super(TopKRankingLoss, self).__init__(name, '{1}Hits@{0} loss'.format(k, ('' if not filtered else 'Filtered ')), print_every_x_batches)
        self.func = None
        self.argsort = None
        self.sum_func = None
        self.k = k
        self.filtered = filtered
        if Config.backend == Backends.TORCH:
            import torch
            self.argsort = lambda x, k: torch.topk(x, k)
            self.sum_func = lambda x: torch.sum(x)

    def calculate_metric(self, state):
        """Return the fraction of rows whose target appears in the top k.

        NOTE: in filtered mode state.pred is modified in place.

        Returns:
            A plain numeric scalar (not a tensor), consistent with
            AccuracyHook, so the running statistics and np.sqrt in the base
            class also work when the predictions live on a CUDA device.

        Raises:
            Exception: If Config.backend is not the torch backend.
        """
        if Config.backend != Backends.TORCH:
            raise Exception('Backend has unsupported value {0}'.format(Config.backend))

        if self.filtered:
            import torch
            # Remember the target columns, mask the flagged entries, then
            # restore the target columns so the target itself stays rankable.
            saved = torch.index_select(state.pred,1,state.targets)
            state.pred[state.multi_labels.byte()] = -100000.0
            state.pred.index_copy_(1, state.targets, saved)

        max_values, argmax = self.argsort(state.pred, self.k)
        in_topk = 0
        for i in range(self.k):
            in_topk += self.sum_func(argmax[:,i] == state.targets)
        n = state.pred.size()[0]
        # float() handles both the 0-dim tensor and the plain int left when k == 0.
        return float(in_topk)/np.float32(n)
166 | 
167 | 
168 | 
class LossHook(AbstractHook):
    """Hook reporting the running mean of the loss."""

    def __init__(self, name='', print_every_x_batches=1000):
        super(LossHook, self).__init__(name, 'Loss', print_every_x_batches)

    def calculate_metric(self, state):
        """Return state.loss, unwrapped to a Python number for torch.

        NOTE(review): for torch the state goes through convert_state here in
        addition to the conversion in at_end_of_iter_event - confirm that
        convert_state is safe to apply twice.
        """
        if Config.backend != Backends.TORCH:
            return state.loss
        converted = self.convert_state(state)
        return converted.loss.item()
179 | 
180 | 
class IntersectionHook(AbstractHook):
    """Hook reporting the per-row overlap between predictions and targets."""

    def __init__(self, name='', print_every_x_batches=1000):
        super(IntersectionHook, self).__init__(name, 'Intersection', print_every_x_batches)

    def calculate_metric(self, state):
        """Return (sum over rows of |pred_row intersect target_row|) / targets.size.

        Rows are taken from the batch itself rather than from
        Config.batch_size, so a final batch that is smaller than the
        configured batch size no longer raises an IndexError.

        NOTE(review): the state goes through convert_state here in addition to
        the conversion in at_end_of_iter_event - confirm that is intended.
        """
        state = self.convert_state(state)
        preds = state.pred
        targets = state.targets
        if Config.cuda:
            preds = preds.cpu()
            targets = targets.cpu()

        preds = preds.numpy()
        targets = targets.numpy()
        n = targets.size
        k = 0
        for pred_row, target_row in zip(preds, targets):
            k += np.intersect1d(pred_row, target_row).size

        return k/float(n)
201 | 
202 | 
203 | 
class ETAHook(AbstractHook, IAtEpochStartObservable):
    """Hook that estimates and logs the remaining time of the current epoch."""

    def __init__(self, name='', print_every_x_batches=1000):
        super(ETAHook, self).__init__(name, 'ETA', print_every_x_batches)
        self.t = Timer(silent=True)
        self.cumulative_t = 0.0
        self.skipped_first = False

    def get_time_string(self, seconds):
        """Format a duration in seconds as H:MM:SS; negative durations give 0:00:00."""
        # Clamp before splitting: divmod on a negative number borrows from the
        # next unit (divmod(-5, 60) == (-1, 55)), which used to print -5s as
        # "0:59:55" because only the individual fields were clamped.
        if seconds < 0: seconds = 0
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)
        return "%d:%02d:%02d" % (h, m, s)

    def calculate_metric(self, state):
        """Estimate the total epoch time from the time spent so far.

        Reads state.num_batches and state.current_idx.
        NOTE(review): raises ZeroDivisionError if state.current_idx is 0 -
        presumably the index is 1-based here; confirm against the batcher.
        """
        n = state.num_batches
        i = state.current_idx
        # NOTE(review): the first tick appears to stop the 'ETA' timer and
        # return its cumulative time, the second to restart it - confirm
        # against Timer.tick.
        cumulative_t = self.t.tick('ETA')
        total_time_estimate = (cumulative_t/i)*n
        self.t.tick('ETA')
        self.cumulative_t = cumulative_t

        return total_time_estimate

    def print_statistic(self, at_epoch_end=False):
        """Log the estimated remaining time with its confidence interval.

        Args:
            at_epoch_end: Accepted for signature parity with
                AbstractHook.print_statistic; not used.

        Returns:
            (lower, upper, mean, n) with the three times formatted as strings,
            or (0, 0, 0, 0) for the first call after an epoch start.
        """
        if not self.skipped_first:
            # the first estimation is very unreliable for time measures
            self.skipped_first = True
            return 0, 0, 0, 0
        n, lower, m, upper = self.get_confidence_intervals()
        # Turn total-time estimates into remaining-time estimates.
        lower -= self.cumulative_t
        m -= self.cumulative_t
        upper -= self.cumulative_t
        lower, m, upper = self.get_time_string(lower), self.get_time_string(m), self.get_time_string(upper)
        log.info('{3} {4}: {2}\t99% CI: ({0}, {1}), n={5}'.format(lower, upper, m, self.name, self.metric_name, n))
        return lower, upper, m, n

    def at_start_of_epoch_event(self, batcher_state):
        """Start the 'ETA' and 'Epoch' timers."""
        self.t.tick('ETA')
        self.t.tick('Epoch')

    def at_end_of_epoch_event(self, state):
        """Stop the timers, log the epoch duration and reset the statistics.

        Returns:
            The epoch time as returned by the timer for 'Epoch'.
        """
        self.t.tock('ETA')
        epoch_time = self.t.tock('Epoch')
        self.epoch_errors.append([epoch_time])
        log.info('Total epoch time: {0}'.format(self.get_time_string(epoch_time)))
        del self.current_scores[:]
        self.n = 0
        self.mean = 0
        self.M2 = 0
        self.skipped_first = False
        self.epoch += 1
        return epoch_time
258 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/interfaces.py:
--------------------------------------------------------------------------------
#These are completely useless, but they signify intent, which is important.
 2 | 
 3 | class IAtIterEndObservable(object):
 4 |     def at_end_of_iter_event(self, batcher_state):
 5 |         raise NotImplementedError('Subclasses of IAtIterEndObservable need to override the end_of_iter_event method')
 6 | 
 7 | class IAtEpochStartObservable(object):
 8 |     def at_start_of_epoch_event(self, batcher_state):
 9 |         raise NotImplementedError('Subclasses of IAtEpochStartObservable need to override the at_start_of_epoch method')
10 | 
class IAtEpochEndObservable(object):
    """Marker interface for objects notified at the end of every epoch."""

    def at_end_of_epoch_event(self, batcher_state):
        """Handle the end-of-epoch event; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        # The message names the method exactly as it has to be overridden.
        raise NotImplementedError('Subclasses of IAtEpochEndObservable need to override the at_end_of_epoch_event method')
14 | 
class IAtBatchPreparedObservable(object):
    """Marker interface for objects notified once a batch has been prepared."""

    def at_batch_prepared(self, batch_parts):
        """Handle the batch-prepared event; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        message = 'Subclasses of IAtBatchPreparedObservable need to override the at_batch_prepared method'
        raise NotImplementedError(message)
18 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/preprocessing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/kg_completion/spodernet/preprocessing/__init__.py


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/preprocessing/vocab.py:
--------------------------------------------------------------------------------
  1 | from collections import Counter
  2 | 
  3 | import numpy as np
  4 | import os
  5 | import time
  6 | import datetime
  7 | import pickle
  8 | import urllib
  9 | # import bashmagic
 10 | import time
 11 | import json
 12 | 
 13 | from spodernet.utils.util import get_data_path, save_data, xavier_uniform_weight
 14 | from os.path import join
 15 | 
 16 | from spodernet.utils.util import Logger
 17 | log = Logger('vocab.py.txt')
 18 | 
 19 | '''This models the vocabulary and token embeddings'''
 20 | 
 21 | class Vocab(object):
 22 |     '''Class that manages work/char embeddings'''
 23 | 
 24 |     def __init__(self, path, vocab = Counter(), labels = {}):
 25 |         '''Constructor.
 26 |         Args:
 27 |             vocab: Counter object with vocabulary.
 28 |         '''
 29 |         self.index = None
 30 |         token2idx = {}
 31 |         idx2token = {}
 32 |         self.label2idx = {}
 33 |         self.idx2label = {}
 34 |         self.glove_cache = {}
 35 |         for i, item in enumerate(vocab.items()):
 36 |             token2idx[item[0]] = i+1
 37 |             idx2token[i+1] = item[0]
 38 | 
 39 |         for idx in labels:
 40 |             self.label2idx[labels[idx]] = idx
 41 |             self.idx2label[idx] = labels[idx]
 42 | 
 43 |         # out of vocabulary token
 44 |         token2idx['OOV'] = int(0)
 45 |         idx2token[int(0)] = 'OOV'
 46 |         # empty = 0
 47 |         token2idx[''] = int(1)
 48 |         idx2token[int(1)] = ''
 49 | 
 50 |         self.token2idx = token2idx
 51 |         self.idx2token = idx2token
 52 |         self.path = path
 53 |         if len(idx2token.keys()) > 0:
 54 |             self.next_idx = int(np.max(list(idx2token.keys())) + 1)
 55 |         else:
 56 |             self.next_idx = int(2)
 57 | 
 58 |         if len(self.idx2label.keys()) > 0:
 59 |             self.next_label_2dx = int(int(np.max(self.idx2label.keys())) + 1)
 60 |         else:
 61 |             self.next_label_idx = int(0)
 62 | 
 63 |     @property
 64 |     def num_token(self):
 65 |         return len(self.token2idx)
 66 | 
 67 |     @property
 68 |     def num_labels(self):
 69 |         return len(self.label2idx)
 70 | 
 71 |     def add_token(self, token):
 72 |         if token not in self.token2idx:
 73 |             self.token2idx[token] = self.next_idx
 74 |             self.idx2token[self.next_idx] = token
 75 |             self.next_idx += 1
 76 | 
 77 |     def add_label(self, label):
 78 |         if label not in self.label2idx:
 79 |             self.label2idx[label] = self.next_label_idx
 80 |             self.idx2label[self.next_label_idx] = label
 81 |             self.next_label_idx += 1
 82 | 
 83 |     def get_idx(self, word):
 84 |         '''Gets the idx if it exists, otherwise returns -1.'''
 85 |         if word in self.token2idx:
 86 |             return self.token2idx[word]
 87 |         else:
 88 |             return self.token2idx['OOV']
 89 | 
 90 |     def get_idx_label(self, label):
 91 |         '''Gets the idx of the label'''
 92 |         return self.label2idx[label]
 93 | 
 94 |     def get_word(self, idx):
 95 |         '''Gets the word if it exists, otherwise returns OOV.'''
 96 |         if idx in self.idx2token:
 97 |             return self.idx2token[idx]
 98 |         else:
 99 |             return self.idx2token[0]
100 | 
101 |     def save_to_disk(self, name=''):
102 |         log.info('Saving vocab to: {0}'.format(self.path))
103 |         pickle.dump([self.token2idx, self.idx2token, self.label2idx,
104 |             self.idx2label], open(self.path + name, 'wb'))
105 | 
106 |     def load_from_disk(self, name=''):
107 |         if not os.path.exists(self.path + name):
108 |             return False
109 |         timestamp = time.ctime(os.path.getmtime(self.path + name))
110 |         timestamp = datetime.datetime.strptime(timestamp, '%a %b %d %H:%M:%S %Y')
111 |         age_in_hours = (datetime.datetime.now() - timestamp).seconds/60./60.
112 |         log.info('Loading vocab from: {0}'.format(self.path + name))
113 |         self.token2idx, self.idx2token, self.label2idx, self.idx2label = pickle.load(open(self.path, 'rb'))
114 |         if age_in_hours > 12:
115 |             log.info('Vocabulary outdated: {0}'.format(self.path + name))
116 |             return False
117 |         else:
118 |             return True
119 | 
120 |     def download_glove(self):
121 |         if not os.path.exists(join(get_data_path(), 'glove')):
122 |             log.info('Glove data is missing, dowloading data now...')
123 |             os.mkdir(join(get_data_path(), 'glove'))
124 |             bashmagic.wget("http://nlp.stanford.edu/data/glove.6B.zip", join(get_data_path(),'glove'))
125 |             bashmagic.unzip(join(get_data_path(), 'glove', 'glove.6B.zip'), join(get_data_path(), 'glove'))
126 | 
127 |     def prepare_glove(self, dimension):
128 |         if self.index is not None: return
129 |         if not os.path.exists(join(get_data_path(), 'glove', 'index_50.p')):
130 |             dims = [50, 100, 200, 300]
131 |             base_filename = 'glove.6B.{0}d.txt'
132 |             paths = [join(get_data_path(), 'glove', base_filename.format(dim)) for dim in dims]
133 |             for path, dim in zip(paths, dims):
134 |                 index = {}
135 |                 index = {'PATH' : path}
136 |                 with open(path, 'rb') as f:
137 |                     log.info('Building index for {0}', path)
138 |                     while True:
139 |                         prev_pos = f.tell()
140 |                         line = f.readline().decode('utf-8')
141 |                         if line == '': break
142 |                         next_pos = f.tell()
143 |                         data = line.strip().split(' ')
144 |                         token = data[0]
145 |                         index[token] = (prev_pos, next_pos)
146 | 
147 |                 log.info('Saving glove index...')
148 |                 json.dump(index, open(join(get_data_path(), 'glove', 'index_{0}.p'.format(dim)), 'w'))
149 | 
150 |         log.info('Loading glove index...')
151 |         self.index = json.load(open(join(get_data_path(), 'glove', 'index_{0}.p'.format(dimension)), 'r'))
152 | 
153 | 
154 |     def load_matrix(self, dim):
155 |         log.info('Initializing glove matrix...')
156 |         X = xavier_uniform_weight(len(self.token2idx), dim)
157 |         log.info('Loading vectors into glove matrix with dimension: {0}', X.shape)
158 |         pretrained_count = 0
159 |         n = len(self.token2idx)-2
160 |         for i, (token, idx) in enumerate(self.token2idx.items()):
161 |             if i % 10000 == 0: print(i)
162 |             vec = self.get_glove_list(token, dim)
163 |             if vec is not None:
164 |                 X[idx] = vec
165 |                 pretrained_count += 1
166 |         log.info('Filled matrix with {0} pretrained embeddings and {1} xavier uniform initialized embeddings.', pretrained_count, n-pretrained_count)
167 |         return X
168 | 
169 |     def get_glove_vector(self, token, dimension=300):
170 |         if token in self.glove_cache: return self.glove_cache[token]
171 |         vec = self.get_glove_list(token, dimension)
172 |         if vec is not None:
173 |             arr = np.array(vec, dtype=np.float32)
174 |             self.glove_cache[token] = arr
175 |             return arr
176 |         else: return None
177 | 
178 |     def get_glove_list(self, token, dimension=300):
179 |         assert dimension in [50, 100, 200, 300], 'Dimension not supported! Only dimension 50, 100, 200, and 300 are supported!'
180 |         self.download_glove()
181 |         self.prepare_glove(dimension)
182 |         vec = None
183 |         if token in self.index:
184 |             p = self.index['PATH']
185 |             with open(p, 'rb') as f:
186 |                 start, end = self.index[token]
187 |                 f.seek(start)
188 |                 line = f.read(end-start).decode('utf-8')
189 |                 data = line.strip().split(' ')
190 |                 vec = data[1:]
191 | 
192 |         return vec
193 | 
194 |     def exists_in_glove(self, token, dimension=300):
195 |         self.download_glove()
196 |         self.prepare_glove(dimension)
197 |         return token in self.index
198 | 
199 | 
200 |     def get_glove_matrix(self, dimension):
201 |         assert dimension in [50, 100, 200, 300], 'Dimension not supported! Only dimension 50, 100, 200, and 300 are supported!'
202 |         self.download_glove()
203 |         return self.load_matrix(dimension)
204 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/kg_completion/spodernet/utils/__init__.py


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/utils/cuda_utils.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import torch
 3 | from torch.cuda import Event
 4 | 
 5 | class CUDATimer(object):
 6 |     def __init__(self, silent=False):
 7 |         self.cumulative_secs = {}
 8 |         self.current_ticks = {}
 9 |         self.silent = silent
10 |         self.end = Event(enable_timing=True, blocking=True)
11 | 
12 |     def tick(self, name='default'):
13 |         if name not in self.current_ticks:
14 |             start = Event(enable_timing=True, blocking=True)
15 |             start.record()
16 |             self.current_ticks[name] = start
17 | 
18 |             return 0.0
19 |         else:
20 |             if name not in self.cumulative_secs:
21 |                 self.cumulative_secs[name] = 0
22 |             self.end.record()
23 |             self.end.synchronize()
24 |             self.cumulative_secs[name] += self.current_ticks[name].elapsed_time(self.end)/1000.
25 |             self.current_ticks.pop(name)
26 | 
27 |             return self.cumulative_secs[name]
28 | 
29 |     def tock(self, name='default'):
30 |         self.tick(name)
31 |         value = self.cumulative_secs[name]
32 |         if not self.silent:
33 |             print('Time taken for {0}: {1:.8f}s'.format(name, value))
34 |         self.cumulative_secs.pop(name)
35 |         if name in self.current_ticks:
36 |             del self.current_ticks[name]
37 |         self.current_ticks.pop(name, None)
38 | 
39 |         return value
40 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/utils/global_config.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from collections import namedtuple
  3 | from spodernet.utils.logger import Logger
  4 | log = Logger('global_config.py.txt')
  5 | 
class Backends:
    # String identifiers for the supported computation backends; compared
    # against Config.backend to select backend-specific code paths.
    TORCH = 'pytorch'
    TENSORFLOW = 'tensorflow'
    TEST = 'test'
    CNTK = 'cntk'
 11 | 
 12 | 
 13 | class Config:
 14 |     dropout = 0.0
 15 |     batch_size = 128
 16 |     learning_rate = 0.001
 17 |     backend = Backends.TORCH
 18 |     L2 = 0.000
 19 |     cuda = False
 20 |     embedding_dim = 128
 21 |     hidden_size = 256
 22 |     input_dropout = 0.0
 23 |     feature_map_dropout = 0.0
 24 |     use_conv_transpose = False
 25 |     use_bias = True
 26 |     optimizer = 'adam'
 27 |     learning_rate_decay = 1.0
 28 |     label_smoothing_epsilon = 0.1
 29 |     epochs = 1000
 30 |     dataset = None
 31 |     process = False
 32 |     model_name = None
 33 | 
 34 |     @staticmethod
 35 |     def parse_argv(argv):
 36 |         file_name = argv[0]
 37 |         args = argv[1:]
 38 |         assert len(args) % 2 == 0, 'Global parser expects an even number of arguments.'
 39 |         values = []
 40 |         names = []
 41 |         for i, token in enumerate(args):
 42 |             if i % 2 == 0:
 43 |                 names.append(token)
 44 |             else:
 45 |                 values.append(token)
 46 | 
 47 |         for i in range(len(names)):
 48 |             if names[i] in alias2params:
 49 |                 log.debug('Replaced parameters alias {0} with name {1}', names[i], alias2params[names[i]])
 50 |                 names[i] = alias2params[names[i]]
 51 | 
 52 |         for i in range(len(names)):
 53 |             name = names[i]
 54 |             if name[:2] == '--': continue
 55 |             if name not in params2type:
 56 |                 log.info('List of possible parameters: {0}', params2type.keys())
 57 |                 log.error('Parameter {0} does not exist. Prefix your custom parameters with -- to skip parsing for global config', name)
 58 |             values[i] = params2type[name](values[i])
 59 | 
 60 |         for name, value in zip(names, values):
 61 |             if name[:2] == '--': continue
 62 |             params2field[name](value)
 63 |             log.info('Set parameter {0} to {1}', name, value)
 64 | 
 65 |     input_dropout = 0.0
 66 |     feature_map_dropout = 0.0
 67 |     use_transposed_convolutions = False
 68 |     use_bias = True
 69 | 
 70 | params2type = {}
 71 | params2type['learning_rate'] = lambda x: float(x)
 72 | params2type['learning_rate_decay'] = lambda x: float(x)
 73 | params2type['dropout'] = lambda x: float(x)
 74 | params2type['batch_size'] = lambda x: int(x)
 75 | params2type['L2'] = lambda x: float(x)
 76 | params2type['embedding_dim'] = lambda x: int(x)
 77 | params2type['hidden_size'] = lambda x: int(x)
 78 | params2type['input_dropout'] = lambda x: float(x)
 79 | params2type['label_smoothing_epsilon'] = lambda x: float(x)
 80 | params2type['feature_map_dropout'] = lambda x: float(x)
 81 | params2type['use_conv_transpose'] = lambda x: x.lower() == 'true' or x == '1'
 82 | params2type['use_bias'] = lambda x: x.lower() == 'true' or x == '1'
 83 | params2type['optimizer'] = lambda x: x
 84 | params2type['epochs'] = lambda x: int(x)
 85 | params2type['dataset'] = lambda x: x
 86 | params2type['model_name'] = lambda x: x
 87 | params2type['process'] = lambda x: x.lower() == 'true' or x == '1'
 88 | 
 89 | alias2params = {}
 90 | alias2params['lr'] = 'learning_rate'
 91 | alias2params['lr_decay'] = 'learning_rate_decay'
 92 | alias2params['l2'] = 'L2'
 93 | alias2params['input_drop'] = 'input_dropout'
 94 | alias2params['hidden_drop'] = 'dropout'
 95 | alias2params['feat_drop'] = 'feature_map_dropout'
 96 | alias2params['bias'] = 'use_bias'
 97 | alias2params['conv_trans'] = 'use_conv_transpose'
 98 | alias2params['opt'] = 'optimizer'
 99 | alias2params['label_smoothing'] = 'label_smoothing_epsilon'
100 | alias2params['model'] = 'model_name'
101 | 
102 | 
103 | 
def _config_setter(field):
    """Build a one-argument callable that stores its argument in Config.<field>."""
    def _assign(value):
        return setattr(Config, field, value)
    return _assign


# Maps a parameter name to the callable that writes its value into Config.
params2field = {
    field: _config_setter(field)
    for field in (
        'learning_rate',
        'learning_rate_decay',
        'dropout',
        'batch_size',
        'L2',
        'embedding_dim',
        'hidden_size',
        'input_dropout',
        'feature_map_dropout',
        'use_conv_transpose',
        'use_bias',
        'optimizer',
        'label_smoothing_epsilon',
        'epochs',
        'dataset',
        'process',
        'model_name',
    )
}
122 | 
123 | 
124 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/utils/logger.py:
--------------------------------------------------------------------------------
  1 | from enum import IntEnum
  2 | from os.path import join
  3 | 
  4 | import os
  5 | import datetime
  6 | import numpy as np
  7 | import time
  8 | 
  9 | # util functions start
 10 | #
# these functions also exist in util.py,
# but since logger is imported everywhere these functions need to be included here
 13 | 
 14 | def get_home_path():
 15 |     return os.environ['HOME']
 16 | 
 17 | def get_logger_path():
 18 |     return join(get_home_path(), '.data', 'log_files')
 19 | 
 20 | def make_dirs_if_not_exists(path):
 21 |     if not os.path.exists(path):
 22 |         os.makedirs(path)
 23 | 
 24 | # util functions end
 25 | class GlobalLogger:
 26 |     timestr = None
 27 |     global_logger_path = None
 28 |     f_global_logger = None
 29 | 
 30 |     @staticmethod
 31 |     def init():
 32 |         GlobalLogger.timestr = time.strftime("%Y%m%d-%H%M%S")
 33 |         if not os.path.exists(join(get_logger_path(), 'full_logs')):
 34 |             os.mkdir(join(get_logger_path(), 'full_logs'))
 35 |         GlobalLogger.global_logger_path = join(get_logger_path(), 'full_logs', GlobalLogger.timestr +  '.txt')
 36 |         GlobalLogger.f_global_logger = open(GlobalLogger.global_logger_path, 'w')
 37 | 
 38 |     @staticmethod
 39 |     def flush():
 40 |         GlobalLogger.f_global_logger.close()
 41 |         GlobalLogger.f_global_logger = open(GlobalLogger.global_logger_path, 'a')
 42 | 
 43 |     def __del__(self):
 44 |         GlobalLogger.f_global_logger.close()
 45 | 
class LogLevel(IntEnum):
    # Severity levels in increasing order. Logger._log writes a message only
    # if its level is >= Logger.GLOBAL_LOG_LEVEL.
    # STATISTICAL messages are written by Logger.statistical only when
    # GLOBAL_LOG_LEVEL is exactly STATISTICAL.
    STATISTICAL = 0
    DEBUG = 1
    INFO = 2
    WARNING = 3
    ERROR = 4
 52 | 
 53 | class Logger:
 54 |     GLOBAL_LOG_LEVEL = LogLevel.INFO
 55 |     LOG_PROPABILITY = 0.05
 56 |     USE_GLOBAL_STATISTICAL_LOG_PROBABILITY = False
 57 |     PRINT_COUNT = 2
 58 | 
 59 |     def __init__(self, file_name, write_type='w'):
 60 |         path = join(get_logger_path(), file_name)
 61 |         path_statistical = join(get_logger_path(), 'statistical_' + file_name)
 62 |         self.path = path
 63 |         make_dirs_if_not_exists(get_logger_path())
 64 |         self.f = open(path, write_type)
 65 |         self.f_statistical = open(path_statistical, write_type)
 66 |         self.rdm = np.random.RandomState(234234)
 67 |         self.debug('Created log file at: {0} with write type: {1}'.format(path, write_type))
 68 |         self.once_dict = {}
 69 | 
 70 |     def __del__(self):
 71 |         self.f.close()
 72 |         self.f_statistical.close()
 73 | 
 74 |     def wrap_message(self, message, log_level, *args):
 75 |         return '{0} ({2}): {1}'.format(datetime.datetime.now(), message.format(*args), log_level.name)
 76 | 
 77 |     def statistical(self, message, p, *args):
 78 |         if Logger.GLOBAL_LOG_LEVEL == LogLevel.STATISTICAL:
 79 |             self._log_statistical(message, p, *args)
 80 | 
 81 |     def debug(self, message, *args):
 82 |         self._log(message, LogLevel.DEBUG, *args)
 83 | 
 84 |     def info_once(self, message, *args):
 85 |         if LogLevel.INFO < Logger.GLOBAL_LOG_LEVEL: return
 86 |         if message not in self.once_dict: self.once_dict[message] = 0
 87 |         if self.once_dict[message] < Logger.PRINT_COUNT:
 88 |             self.once_dict[message] += 1
 89 |             self._log(message, LogLevel.INFO, *args)
 90 | 
 91 |     def debug_once(self, message, *args):
 92 |         if LogLevel.DEBUG < Logger.GLOBAL_LOG_LEVEL: return
 93 |         if message not in self.once_dict: self.once_dict[message] = 0
 94 |         if self.once_dict[message] < Logger.PRINT_COUNT:
 95 |             self.once_dict[message] += 1
 96 |             self._log(message, LogLevel.DEBUG, *args)
 97 | 
 98 |     def info(self, message, *args):
 99 |         self._log(message, LogLevel.INFO, *args)
100 | 
101 |     def warning(self, message, *args):
102 |         self._log(message, LogLevel.WARNING, *args)
103 | 
104 |     def error(self, message, *args):
105 |         self._log(message, LogLevel.ERROR, *args)
106 |         raise Exception(message.format(*args))
107 | 
108 |     def _log_statistical(self, message, p, *args):
109 |         rdm_num = self.rdm.rand()
110 |         if Logger.USE_GLOBAL_STATISTICAL_LOG_PROBABILITY:
111 |             if rdm_num < Logger.LOG_PROPABILITY:
112 |                 message = self.wrap_message(message, LogLevel.STATISTICAL, *args)
113 |                 self.f_statistical.write(message + '\n')
114 |         else:
115 |             if rdm_num < p:
116 |                 message = self.wrap_message(message, LogLevel.STATISTICAL, *args)
117 |                 self.f_statistical.write(message + '\n')
118 | 
119 |     def _log(self, message, log_level=LogLevel.INFO, *args):
120 |         if log_level >= Logger.GLOBAL_LOG_LEVEL:
121 |             message = self.wrap_message(message, log_level, *args)
122 |             if message.strip() != '':
123 |                 print(message)
124 |                 self.f.write(message + '\n')
125 |                 if GlobalLogger.f_global_logger is None: GlobalLogger.init()
126 |                 GlobalLogger.f_global_logger.write(message + '\n')
127 | 
128 | 
129 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/utils/spacy_util.py:
--------------------------------------------------------------------------------
 1 | import spacy
 2 | 
 3 | subjects = set(['nsubj'])
 4 | objects = set(['dobj', 'pobj'])
 5 | 
 6 | def merge_noun_phrases(sent_doc):
 7 |     for np in sent_doc.noun_chunks:
 8 |        np.merge(np.root.tag_, np.text, np.root.ent_type_)
 9 | 
def merge_entities(sent_doc):
    """Merge every named entity of the document into a single token.

    Each entity is merged with its root's dependency label, its own text and
    its entity label.
    """
    for entity in sent_doc.ents:
        root_dep = entity.root.dep_
        entity.merge(root_dep, entity.text, entity.label_)
13 | 
def merge_verbs(sent_doc):
    """Merge runs of adjacent VERB tokens, first in windows of 3, then of 2.

    The document is modified in place while it is being scanned.

    NOTE(review): the look-back tests the tokens at pos-1 .. pos-(width-1),
    but the merged slice is [pos-1, pos+width-1), i.e. it extends forward for
    width 3 - confirm that this is intended. At pos == 1 with width 3 the
    look-back index is -1, which for a list-like document means the last
    token.
    """
    for width in (3, 2):
        pos = 1
        while pos < len(sent_doc) - 1:
            step = 1
            if sent_doc[pos].pos_ == 'VERB':
                preceding = [sent_doc[pos - back].pos_ == 'VERB' for back in range(1, width)]
                if all(preceding):
                    sent_doc[pos - 1:pos + width - 1].merge()
                    # Skip past the tokens that were just merged.
                    step = width
            pos += step
30 | 
def merge_with_set(sent_doc, to_match, write_key='pobj'):
    """Merge a token labelled `write_key` with neighbouring tokens.

    For window widths 5 down to 2, a token whose dep_ is in `write_key` is
    merged when all width-1 preceding tokens have a dep_ in `to_match` and
    are not verbs; the merged token receives the anchor's dep_. The document
    is modified in place while it is being scanned.

    NOTE(review): with a string `write_key`, `in` is a substring test rather
    than an equality test - confirm intended. As in merge_verbs, the look-back
    covers pos-1 .. pos-(width-1) while the merged slice is
    [pos-1, pos+width-1) - confirm intended.
    """
    for width in (5, 4, 3, 2):
        pos = width - 1
        while pos < len(sent_doc) - 1:
            step = 1
            anchor_dep = sent_doc[pos].dep_
            if anchor_dep in write_key:
                preceding = [sent_doc[pos - back] for back in range(1, width)]
                if all(tok.dep_ in to_match and tok.pos_ != 'VERB' for tok in preceding):
                    sent_doc[pos - 1:pos + width - 1].merge()
                    sent_doc[pos - 1].dep_ = anchor_dep
                    # Skip past the tokens that were just merged.
                    step = width
            pos += step
49 | 
def merge_tokens(sent_doc):
    """Run the full merging pipeline on ``sent_doc`` in place.

    Noun phrases, entities and verb runs are merged first, followed by a fixed
    sequence of ``merge_with_set`` passes (each repeated two or three times).
    """
    merge_noun_phrases(sent_doc)
    merge_entities(sent_doc)
    merge_verbs(sent_doc)

    listing = ('attr', 'punct', 'cc', 'conj', 'pobj')
    # (labels allowed in a run, extra positional args, number of repetitions)
    passes = (
        (('pobj', 'prep'), (), 3),
        (listing, ('pobj',), 2),
        (('dobj', 'pobj'), ('dobj',), 2),
        (listing + ('dobj',), ('dobj',), 2),
    )
    for labels, extra, repeats in passes:
        for _ in range(repeats):
            merge_with_set(sent_doc, set(labels), *extra)
63 | 
def extract_triples(sent_doc):
    """Collect [subject, verb, object] token triples from ``sent_doc``.

    Tokens are scanned in order. A token whose ``dep_`` is in the module-level
    ``subjects`` set starts a new candidate; a VERB extends a candidate that
    currently ends in a subject; a token whose ``dep_`` is in ``objects``
    completes a candidate that currently ends in a verb. Any other combination
    discards the candidate.

    Returns:
        A list of three-element token lists.
    """
    found = []
    partial = []
    for tok in sent_doc:
        if tok.pos_ == 'VERB':
            if not partial:
                continue
            if partial[-1].dep_ in subjects:
                partial.append(tok)
            else:
                partial = []

        if tok.dep_ in subjects:
            # A subject always (re)starts the candidate.
            partial = [tok]

        if tok.dep_ in objects:
            if not partial:
                continue
            if partial[-1].pos_ == 'VERB':
                found.append(partial + [tok])
            # Completed or not, the candidate is reset after an object.
            partial = []
    return found
88 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/spodernet/utils/util.py:
--------------------------------------------------------------------------------
  1 | from os.path import join
  2 | from scipy.sparse import csr_matrix, spmatrix
  3 | 
  4 | import h5py
  5 | import os
  6 | import time
  7 | import os
  8 | import numpy as np
  9 | import torch
 10 | 
from spodernet.utils.logger import Logger
# Module-level logger used by the helpers below.
# NOTE(review): the argument is presumably the log file name -- confirm against Logger.
log = Logger('util.py.txt')

# Fixed-seed random generator shared by this module (used by xavier_uniform_weight).
rdm = np.random.RandomState(2345235)
 15 | 
 16 | def save_dense_hdf(path, data):
 17 |     '''Writes a numpy array to a hdf5 file under the given path.'''
 18 |     log.debug_once('Saving hdf5 file to: {0}', path)
 19 |     h5file = h5py.File(path, "w")
 20 |     h5file.create_dataset("default", data=data)
 21 |     h5file.close()
 22 | 
 23 | 
 24 | def load_dense_hdf(path, keyword='default'):
 25 |     '''Reads and returns a numpy array for a hdf5 file'''
 26 |     log.debug_once('Reading hdf5 file from: {0}', path)
 27 |     h5file = h5py.File(path, 'r')
 28 |     dset = h5file.get(keyword)
 29 |     data = dset[:]
 30 |     h5file.close()
 31 |     return data
 32 | 
 33 | def save_sparse_hdf(path, data):
 34 |     shape = data.shape
 35 |     sparse = csr_matrix(data)
 36 |     folder, filename = os.path.split(path)
 37 |     save_dense_hdf(join(folder, 'data_' + filename), sparse.data)
 38 |     save_dense_hdf(join(folder, 'indices_' + filename), sparse.indices)
 39 |     save_dense_hdf(join(folder, 'indptr_' + filename), sparse.indptr)
 40 |     save_dense_hdf(join(folder, 'shape_dense_' + filename), shape)
 41 |     save_dense_hdf(join(folder, 'shape_sparse_' + filename), sparse.shape)
 42 | 
 43 | def load_sparse_hdf(path, keyword='default'):
 44 |     folder, filename = os.path.split(path)
 45 |     data = load_dense_hdf(join(folder, 'data_' + filename))
 46 |     indices = load_dense_hdf(join(folder, 'indices_' + filename))
 47 |     indptr = load_dense_hdf(join(folder, 'indptr_' + filename))
 48 |     shape = load_dense_hdf(join(folder, 'shape_dense_' + filename))
 49 |     shape_sparse = load_dense_hdf(join(folder, 'shape_sparse_' + filename))
 50 |     return csr_matrix((data, indices, indptr), shape=shape_sparse).toarray().reshape(shape)
 51 | 
 52 | def load_data(path):
 53 |     folder, filename = os.path.split(path)
 54 |     if os.path.exists(join(folder, 'indptr_' + filename)):
 55 |         data = load_sparse_hdf(path)
 56 |         return data
 57 |     else:
 58 |         return load_dense_hdf(path)
 59 | 
 60 | def save_data(path, data):
 61 |     assert data.size > 0
 62 |     is_sparse = isinstance(data, spmatrix)
 63 |     if is_sparse:
 64 |         save_sparse_hdf(path, data)
 65 |         return
 66 | 
 67 |     zero = (data == 0.0).sum()
 68 |     percent = zero/float(data.size)
 69 |     if percent > 0.5:
 70 |         save_sparse_hdf(path, data)
 71 |     else:
 72 |         save_dense_hdf(path, data)
 73 | 
 74 | 
 75 | def load_hdf5_paths(paths, limit=None):
 76 |     data = []
 77 |     for path in paths:
 78 |         if limit != None:
 79 |             data.append(load_data(path)[:limit])
 80 |         else:
 81 |             data.append(load_data(path))
 82 |     return data
 83 | 
 84 | def get_home_path():
 85 |     return os.environ['HOME']
 86 | 
 87 | def get_data_path():
 88 |     return join(os.environ['HOME'], '.data')
 89 | 
 90 | def make_dirs_if_not_exists(path):
 91 |     if not os.path.exists(path):
 92 |         os.makedirs(path)
 93 | 
 94 | # taken from pytorch; gain parameter is omitted
 95 | def xavier_uniform_weight(fan_in, fan_out):
 96 |     std = np.sqrt(2.0 / (fan_in + fan_out))
 97 |     a = np.sqrt(3.0) * std
 98 |     return np.float32(rdm.uniform(-a, a, size=(fan_in, fan_out)))
 99 | 
def embedding_sequence2text(vocab, embedding, break_at_0=True):
    """Convert rows of token indices into lists of words.

    Args:
        vocab: Object exposing ``get_word(idx)``.
        embedding: 2-D collection of indices: a numpy array, or an object
            offering ``.cpu().numpy()`` (``.data.cpu().numpy()`` for
            ``torch.autograd.Variable``).
        break_at_0: When true (default), conversion of a row stops at the
            first index equal to 0. When false, index 0 is looked up like any
            other index. (Previously this flag was ignored and rows were
            always cut at 0.)

    Returns:
        A list with one list of words per row.
    """
    if isinstance(embedding, np.ndarray):
        emb = embedding
    elif isinstance(embedding, torch.autograd.Variable):
        emb = embedding.data.cpu().numpy()
    else:
        emb = embedding.cpu().numpy()

    sentences = []
    for row in emb:
        words = []
        for idx in row:
            if break_at_0 and idx == 0:
                break
            words.append(vocab.get_word(idx))
        sentences.append(words)
    return sentences
116 | 
class PercentileRejecter(object):
    """Flags values that exceed a percentile of all values seen so far.

    The percentile is recomputed only every ``compute_every`` calls; that
    interval grows while the threshold stays within 5% of its previous value
    and shrinks (never below 1) when it moves further.
    """

    def __init__(self, above_percentile_threshold):
        # Every value ever passed to above_percentile.
        self.values = []
        # Percentile used when the caller does not supply one.
        self.percentile_threshold = above_percentile_threshold
        # Most recently stored percentile value.
        self.threshold_value = 0
        # Number of calls made so far.
        self.current_iter = 0
        # Recompute the percentile every this many calls.
        self.compute_every = 1

    def above_percentile(self, value, percentile=None):
        """Record ``value`` and report whether it exceeds the threshold.

        Args:
            value: New observation.
            percentile: Optional percentile that overrides the configured one;
                when given, the percentile is recomputed on this call.

        Returns:
            ``False`` until 20 values have been recorded, afterwards the
            result of ``value > threshold``.
        """
        self.values.append(value)
        self.current_iter += 1
        if len(self.values) < 20:
            return False

        if percentile is not None:
            cutoff = np.percentile(self.values, percentile)
            self.threshold_value = cutoff
            return value > cutoff

        if self.current_iter % self.compute_every != 0:
            # Not a recompute step: reuse the stored threshold.
            return value > self.threshold_value

        cutoff = np.percentile(self.values, self.percentile_threshold)
        stored = self.threshold_value
        drifted = cutoff*1.05 < stored or cutoff*0.95 > stored
        if drifted:
            self.threshold_value = cutoff
            self.compute_every = max(self.compute_every - 1, 1)
        else:
            self.compute_every += 1
        return value > cutoff
147 | 
148 | 
class Timer(object):
    """Named stopwatch that accumulates elapsed seconds per name."""

    def __init__(self, silent=False):
        # Accumulated seconds per name.
        self.cumulative_secs = {}
        # Start timestamp of each currently running interval.
        self.current_ticks = {}
        # When true, tock() does not log the measured time.
        self.silent = silent

    def tick(self, name='default'):
        """Start an interval for ``name``, or stop the one that is running.

        Returns:
            ``0.0`` when an interval was started, otherwise the accumulated
            seconds for ``name`` after adding the interval that just ended.
        """
        if name in self.current_ticks:
            now = time.time()
            started = self.current_ticks.pop(name)
            total = self.cumulative_secs.get(name, 0) + (now - started)
            self.cumulative_secs[name] = total
            return total

        self.current_ticks[name] = time.time()
        return 0.0

    def tock(self, name='default'):
        """Finish timing ``name``, report the total, and forget it.

        Calls ``tick`` once, reads the accumulated seconds, logs them unless
        ``silent`` is set, and clears all state for ``name``.

        Returns:
            The accumulated seconds for ``name``.
        """
        self.tick(name)
        elapsed = self.cumulative_secs[name]
        if not self.silent:
            log.info('Time taken for {0}: {1:.8f}s'.format(name, elapsed))
        del self.cumulative_secs[name]
        # The tick above may have started a fresh interval; drop it as well.
        self.current_ticks.pop(name, None)

        return elapsed
178 | 
179 | 


--------------------------------------------------------------------------------
/KDD2021_demo/kg_completion/wrangle_KG.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from os.path import join
  3 | import json
  4 | 
  5 | import argparse
  6 | import datetime
  7 | import json
  8 | import urllib
  9 | import pickle
 10 | import os
 11 | import numpy as np
 12 | import operator
 13 | import sys
 14 | 
# NOTE(review): this generator is replaced a few lines below before being used.
rdm = np.random.RandomState(234234)

# The dataset name is taken from the first command-line argument when given.
if len(sys.argv) > 1:
    dataset_name = sys.argv[1]
else:
    dataset_name = 'FB15k-237'
    #dataset_name = 'FB15k'
    #dataset_name = 'yago'
    #dataset_name = 'WN18RR'

print('Processing dataset {0}'.format(dataset_name))

rdm = np.random.RandomState(2342423)
# Relative path: resolved against the current working directory.
base_path = 'kg_completion/data/{0}/'.format(dataset_name)
files = ['train.txt', 'valid.txt', 'test.txt']

# Read every line of all three split files into one list (later files are prepended).
# NOTE(review): `data` is not referenced again in the visible part of this script.
data = []
for p in files:
    with open(join(base_path, p)) as f:
        data = f.readlines() + data
 35 | 
 36 | 
 37 | label_graph = {}
 38 | train_graph = {}
 39 | test_cases = {}
 40 | for p in files:
 41 |     test_cases[p] = []
 42 |     train_graph[p] = {}
 43 | 
 44 | 
 45 | for p in files:
 46 |     with open(join(base_path, p)) as f:
 47 |         for i, line in enumerate(f):
 48 |             e1, rel, e2 = line.split('\t')
 49 |             e1 = e1.strip()
 50 |             e2 = e2.strip()
 51 |             rel = rel.strip()
 52 |             rel_reverse = rel+ '_reverse'
 53 | 
 54 |             # data
 55 |             # (Mike, fatherOf, John)
 56 |             # (John, fatherOf, Tom)
 57 | 
 58 |             if (e1 , rel) not in label_graph:
 59 |                 label_graph[(e1, rel)] = set()
 60 | 
 61 |             if (e2,  rel_reverse) not in label_graph:
 62 |                 label_graph[(e2, rel_reverse)] = set()
 63 | 
 64 |             if (e1,  rel) not in train_graph[p]:
 65 |                 train_graph[p][(e1, rel)] = set()
 66 |             if (e2, rel_reverse) not in train_graph[p]:
 67 |                 train_graph[p][(e2, rel_reverse)] = set()
 68 | 
 69 |             # labels
 70 |             # (Mike, fatherOf, John)
 71 |             # (John, fatherOf, Tom)
 72 |             # (John, fatherOf_reverse, Mike)
 73 |             # (Tom, fatherOf_reverse, Mike)
 74 |             label_graph[(e1, rel)].add(e2)
 75 | 
 76 |             label_graph[(e2, rel_reverse)].add(e1)
 77 | 
 78 |             # test cases
 79 |             # (Mike, fatherOf, John)
 80 |             # (John, fatherOf, Tom)
 81 |             test_cases[p].append([e1, rel, e2])
 82 | 
 83 |             # data
 84 |             # (Mike, fatherOf, John)
 85 |             # (John, fatherOf, Tom)
 86 |             # (John, fatherOf_reverse, Mike)
 87 |             # (Tom, fatherOf_reverse, John)
 88 |             train_graph[p][(e1, rel)].add(e2)
 89 |             train_graph[p][(e2, rel_reverse)].add(e1)
 90 | 
 91 | 
 92 | 
 93 | def write_training_graph(cases, graph, path):
 94 |     with open(path, 'w') as f:
 95 |         n = len(graph)
 96 |         for i, key in enumerate(graph):
 97 |             e1, rel = key
 98 |             # (Mike, fatherOf, John)
 99 |             # (John, fatherOf, Tom)
100 |             # (John, fatherOf_reverse, Mike)
101 |             # (Tom, fatherOf_reverse, John)
102 | 
103 |             # (John, fatherOf) -> Tom
104 |             # (John, fatherOf_reverse, Mike) 
105 |             entities1 = " ".join(list(graph[key]))
106 | 
107 |             data_point = {}
108 |             data_point['e1'] = e1
109 |             data_point['e2'] = 'None'
110 |             data_point['rel'] = rel
111 |             data_point['rel_eval'] = 'None'
112 |             data_point['e2_multi1'] =  entities1
113 |             data_point['e2_multi2'] = "None"
114 | 
115 |             f.write(json.dumps(data_point)  + '\n')
116 | 
def write_evaluation_graph(cases, graph, path):
    """Write one JSON line per ``[e1, rel, e2]`` case to ``path``.

    For every case the record holds the triple itself, the reverse relation
    name in ``rel_eval``, the space-joined targets of ``(e1, rel)`` in
    ``e2_multi1`` and the space-joined targets of ``(e2, rel + '_reverse')``
    in ``e2_multi2``. Both keys must be present in ``graph``.
    """
    with open(path, 'w') as f:
        for e1, rel, e2 in cases:
            rel_reverse = rel + '_reverse'
            forward_targets = " ".join(list(graph[(e1, rel)]))
            backward_targets = " ".join(list(graph[(e2, rel_reverse)]))
            record = {
                'e1': e1,
                'e2': e2,
                'rel': rel,
                'rel_eval': rel_reverse,
                'e2_multi1': forward_targets,
                'e2_multi2': backward_targets,
            }
            f.write(json.dumps(record) + '\n')
142 | 
143 | 
# Emit the four JSON-lines files: the training graph, dev/test ranking files
# (labelled against all splits), and the full graph over all splits.
train_cases = test_cases['train.txt']
valid_cases = test_cases['valid.txt']
held_out_cases = test_cases['test.txt']
all_cases = train_cases + valid_cases + held_out_cases

write_training_graph(
    train_cases,
    train_graph['train.txt'],
    'kg_completion/data/{0}/e1rel_to_e2_train.json'.format(dataset_name))
write_evaluation_graph(
    valid_cases,
    label_graph,
    'kg_completion/data/{0}/e1rel_to_e2_ranking_dev.json'.format(dataset_name))
write_evaluation_graph(
    held_out_cases,
    label_graph,
    'kg_completion/data/{0}/e1rel_to_e2_ranking_test.json'.format(dataset_name))
write_training_graph(
    all_cases,
    label_graph,
    'kg_completion/data/{0}/e1rel_to_e2_full.json'.format(dataset_name))
149 | 


--------------------------------------------------------------------------------
/KDD2021_demo/math_word_problem_solving/config.yaml:
--------------------------------------------------------------------------------
# Component selection: graph construction method, graph embedding module and decoder.
# NOTE(review): the names are presumably resolved by the graph4nlp library -- confirm against the installed version.
graph_construction_name: "dependency"
graph_embedding_name: "graphsage"
decoder_name: "stdtree"
 4 | 
 5 | graph_construction_args:
 6 |   graph_construction_share:
 7 |     graph_type: 'dependency'
 8 |     root_dir: "./data"
 9 |     topology_subdir: 'DependencyGraph'
10 |     thread_number: 4
11 |     port: 9000
12 |     timeout: 15000
13 | 
14 |   graph_construction_private:
15 |     edge_strategy: 'homogeneous'
16 |     merge_strategy: 'tailhead'
17 |     sequential_link: true
18 |     as_node: false
19 | 
20 |   node_embedding:
21 |     input_size: 300
22 |     hidden_size: 300
23 |     word_dropout: 0.1
24 |     rnn_dropout: 0.1
25 |     fix_bert_emb: false
26 |     fix_word_emb: false
27 |     embedding_style:
28 |       single_token_item: true
29 |       emb_strategy: "w2v_bilstm"
30 |       num_rnn_layers: 1
31 |       bert_model_name: null
32 |       bert_lower_case: null
33 | 
34 |     sim_metric_type: 'weighted_cosine'
35 |     num_heads: 1
36 |     top_k_neigh: null
37 |     epsilon_neigh: 0.5
38 |     smoothness_ratio: 0.1
39 |     connectivity_ratio: 0.05
40 |     sparsity_ratio: 0.1
41 | 
42 | graph_embedding_args:
43 |   graph_embedding_share:
44 |     num_layers: 1
45 |     input_size: 300
46 |     hidden_size: 300
47 |     output_size: 300
48 |     direction_option: "undirected"
49 |     feat_drop: 0.0
50 |     attn_drop: 0.0
51 | 
52 |   graph_embedding_private:
53 |     aggregator_type: "lstm"
54 |     bias: true
55 |     norm: null
56 |     activation: "relu"
57 |     use_edge_weight: false
58 | 
59 | decoder_args:
60 |   rnn_decoder_share:
61 |     rnn_type: "lstm"
62 |     input_size: 300
63 |     hidden_size: 300
64 |     rnn_emb_input_size: 300
65 |     use_copy: true
66 |     graph_pooling_strategy: null
67 |     attention_type: "uniform"
68 |     fuse_strategy: "concatenate"
69 |     dropout: 0.3
70 |     teacher_forcing_rate: 1.0
71 | 
72 |   rnn_decoder_private:
73 |     max_decoder_step: 35
74 |     max_tree_depth: 8
75 |     use_sibling: false
76 |     use_input_feed: true
77 | 


--------------------------------------------------------------------------------
/KDD2021_demo/math_word_problem_solving/data/processed/DependencyGraph/data.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/math_word_problem_solving/data/processed/DependencyGraph/data.pt


--------------------------------------------------------------------------------
/KDD2021_demo/math_word_problem_solving/data/processed/DependencyGraph/vocab.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/math_word_problem_solving/data/processed/DependencyGraph/vocab.pt


--------------------------------------------------------------------------------
/KDD2021_demo/math_word_problem_solving/imgs/g2t.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/KDD2021_demo/math_word_problem_solving/imgs/g2t.png


--------------------------------------------------------------------------------
/KDD2021_demo/math_word_problem_solving/utils.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | import torch
 3 | import sympy
 4 | from random import randint
 5 | from sympy.parsing.sympy_parser import parse_expr
 6 | 
 7 | def convert_to_string(idx_list, form_manager):
 8 |     w_list = []
 9 |     for i in range(len(idx_list)):
10 |         w_list.append(form_manager.get_idx_symbol(int(idx_list[i])))
11 |     return " ".join(w_list)
12 | 
def is_all_same(c1, c2, form_manager):
    """Return True when two index sequences match or solve to the same value.

    The sequences are first compared element by element. Only when they
    differ (or have different lengths) is ``is_solution_same`` consulted.
    """
    identical = len(c1) == len(c2) and not any(
        c1[j] != c2[j] for j in range(len(c1))
    )
    if identical:
        return True
    return True if is_solution_same(c1, c2, form_manager) else False
27 | 
28 | 
def is_solution_same(i1, i2, form_manager):
    """Check whether two equations, given as symbol indices, agree on ``x``.

    Both index sequences are rendered to text via
    ``form_manager.get_idx_symbol``. Each text is split on '=', the first two
    parts are parsed with sympy and the equation is solved for ``x``; the
    first solutions are compared.

    Returns:
        ``False`` when either text lacks '=', contains the unknown token,
        has no solution, or fails to parse or solve; otherwise whether the
        first solutions are equal.
    """
    c1 = " ".join([form_manager.get_idx_symbol(x) for x in i1])
    c2 = " ".join([form_manager.get_idx_symbol(x) for x in i2])
    if ('=' not in c1) or ('=' not in c2):
        return False
    if (form_manager.unk_token in c1) or (form_manager.unk_token in c2):
        return False

    try:
        x = sympy.Symbol('x')
        # NOTE(review): parse_expr evaluates its input as code; only feed it
        # text produced from the model vocabulary, never untrusted strings.
        s1 = c1.split('=')
        s2 = c2.split('=')
        lhs1, rhs1 = parse_expr(s1[0]), parse_expr(s1[1])
        lhs2, rhs2 = parse_expr(s2[0]), parse_expr(s2[1])
        res1 = sympy.solve(sympy.Eq(lhs1, rhs1), x)
        res2 = sympy.solve(sympy.Eq(lhs2, rhs2), x)

        if not res1 or not res2:
            return False
        return res1[0] == res2[0]
    except Exception:
        # Malformed or unsolvable expressions count as a mismatch.
        # KeyboardInterrupt and SystemExit are deliberately not caught here.
        return False
61 | 
def compute_accuracy(candidate_list, reference_list, form_manager):
    """Return the fraction of candidates that match their reference.

    Pairs are compared with ``is_all_same``. When the lists differ in length
    a message is printed and only the first ``min(len)`` pairs are scored.

    Returns:
        Accuracy as a float; ``0.0`` when there is nothing to compare
        (previously this raised ``ZeroDivisionError``).
    """
    if len(candidate_list) != len(reference_list):
        print("candidate list has length {}, reference list has length {}\n".format(
            len(candidate_list), len(reference_list)))
    len_min = min(len(candidate_list), len(reference_list))
    if len_min == 0:
        return 0.0
    c = sum(
        1 for i in range(len_min)
        if is_all_same(candidate_list[i], reference_list[i], form_manager)
    )
    return c/float(len_min)
74 | 
75 | 
def compute_tree_accuracy(candidate_list_, reference_list_, form_manager):
    """Copy both inputs into plain lists and score them with ``compute_accuracy``."""
    candidate_list = [candidate_list_[i] for i in range(len(candidate_list_))]
    reference_list = [reference_list_[i] for i in range(len(reference_list_))]
    return compute_accuracy(candidate_list, reference_list, form_manager)
84 | 
def prepare_oov(batch_graph, src_vocab, device):
    """Build a per-batch vocabulary extended with out-of-vocabulary tokens.

    A deep copy of ``src_vocab`` is made; every node token that maps to the
    unknown-token index is added to the copy. The resulting indices of all
    node tokens are stored, as a long tensor on ``device``, under
    ``batch_graph.node_features['token_id_oov']``.

    Returns:
        The extended vocabulary copy; ``src_vocab`` itself is not modified.
    """
    oov_dict = copy.deepcopy(src_vocab)
    token_ids = []
    for attrs in batch_graph.node_attributes:
        tok = attrs['token']
        is_unknown = oov_dict.get_symbol_idx(tok) == oov_dict.get_symbol_idx(oov_dict.unk_token)
        if is_unknown:
            oov_dict.add_symbol(tok)
        token_ids.append(oov_dict.get_symbol_idx(tok))
    id_tensor = torch.tensor(token_ids, dtype=torch.long)
    batch_graph.node_features['token_id_oov'] = id_tensor.to(device)
    return oov_dict


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Deep Learning on Graphs for Natural Language Processing Demo
 2 | 
 3 | 
 4 | The repository contains code examples for [DLG4NLP](https://dlg4nlp.github.io) tutorials at [NAACL 2021](https://2021.naacl.org), [SIGIR 2021](https://sigir.org/sigir2021/), [KDD 2021](https://www.kdd.org/kdd2021/), [IJCAI 2021](http://ijcai-21.org), [AAAI 2022](https://aaai.org/Conferences/AAAI-22/) and [TheWebConf 2022](https://www2022.thewebconf.org/conference-schedule/).
 5 | 
 6 | Slides can be downloaded from [here](https://dlg4nlp.github.io/tutorials.html).
 7 | 
 8 | 
 9 | ## Get Started
10 | 
You will need to install our [graph4nlp library](https://github.com/graph4ai/graph4nlp) in order to run the demo code. Please follow the environment setup instructions below. Please also refer to the [*graph4nlp* repository page](https://github.com/graph4ai/graph4nlp#readme) for more details on how to use the library.
12 | 
13 | 
14 | ### Environment setup
15 | 
16 | 1. Create virtual environment
17 | ```
18 | conda create --name graph4nlp python=3.8
19 | conda activate graph4nlp
20 | ```
21 | 
22 | 2. Install [graph4nlp](https://github.com/graph4ai/graph4nlp) library
23 | - Clone the github repo
24 | ```
25 | git clone -b [branch_version] https://github.com/graph4ai/graph4nlp.git
26 | cd graph4nlp
27 | ```
28 | Please choose the branch version corresponding to the demo version as shown in the table below.
29 | 
30 | | demo version | library branch version |  
31 | | ---- | ---- |  
32 | | DLG4NLP@ICLR 2022 | v0.5.5 |
33 | | TheWebConf 2022 | v0.5.5 |
34 | | AAAI 2022 | v0.5.5 |  
35 | | CLIQ-ai 2021 | stable_nov2021b |  
36 | | IJCAI 2021 | stable_202108 |  
37 | | KDD 2021 | stable_202108 |  
38 | | SIGIR 2021 | stable |  
39 | | NAACL 2021 | stable |  
40 | 
41 | 
- Then run `./configure` (or `./configure.bat` if you are using Windows 10) to configure your installation. The configuration program will ask you to specify your CUDA version. If you do not have a GPU, please choose 'cpu'.
43 | ```
44 | ./configure
45 | ```
46 | - Finally, install the package
47 | ```
48 | python setup.py install
49 | ```
50 | 3. Install other packages
51 | ```
52 | pip install torchtext
53 | pip install notebook
54 | ```
55 | 
56 | 4. Set up StanfordCoreNLP (for static graph construction only, unnecessary for this demo because preprocessed data is provided)
57 | - Download [StanfordCoreNLP](https://stanfordnlp.github.io/CoreNLP/)
58 | - Go to the root folder and start the server
59 | ```
60 | java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
61 | ```
62 | 
63 | 
64 | ### Start Jupyter notebook and run the demo
65 | 
After completing the above steps, you can start the Jupyter notebook server to run the demo:
67 | ```
68 | cd graph4nlp_demo/XYZ
69 | jupyter notebook
70 | ```
71 | Note that you will need to change `XYZ` to the specific folder name.
72 | 
73 | ## Additional Resources:
74 | 
75 | * [Graph4NLP library](https://github.com/graph4ai/graph4nlp)
76 | * [DLG4NLP website](https://dlg4nlp.github.io/index.html)
77 | * [DLG4NLP survey](https://arxiv.org/pdf/2106.06090)
78 | * [DLG4NLP literature repo](https://github.com/graph4ai/graph4nlp_literature)
79 | 
80 | <!-- ### Citation: -->
81 | 
82 | 


--------------------------------------------------------------------------------
/TheWebConf2022_demo/Math-word-problem/config.yaml:
--------------------------------------------------------------------------------
# Component selection: graph construction method, graph embedding module and decoder.
# NOTE(review): the names are presumably resolved by the graph4nlp library -- confirm against the installed version.
graph_construction_name: "node_emb"
graph_embedding_name: "graphsage"
decoder_name: "stdtree"
 4 | 
 5 | graph_construction_args:
 6 |   graph_construction_share:
 7 |     graph_name: 'node_emb'
 8 |     root_dir: "data"
 9 |     topology_subdir: 'NodeEmbGraph'
10 |     thread_number: 4
11 |     port: 9000
12 |     timeout: 15000
13 | 
14 |   graph_construction_private:
15 |     edge_strategy: 'homogeneous'
16 |     merge_strategy: 'tailhead'
17 |     sequential_link: true
18 |     as_node: false
19 |     sim_metric_type: 'weighted_cosine'
20 |     num_heads: 1
21 |     top_k_neigh: null
22 |     epsilon_neigh: 0.5
23 |     smoothness_ratio: 0.1
24 |     connectivity_ratio: 0.05
25 |     sparsity_ratio: 0.1
26 | 
27 | graph_initialization_args:
28 |   input_size: 300
29 |   hidden_size: 300
30 |   word_dropout: 0.1
31 |   rnn_dropout: 0.1
32 |   fix_bert_emb: false
33 |   fix_word_emb: false
34 |   embedding_style:
35 |     single_token_item: true
36 |     emb_strategy: "w2v_bilstm"
37 |     num_rnn_layers: 1
38 |     bert_model_name: null
39 |     bert_lower_case: null
40 | 
41 | graph_embedding_args:
42 |   graph_embedding_share:
43 |     num_layers: 1
44 |     input_size: 300
45 |     hidden_size: 300
46 |     output_size: 300
47 |     direction_option: "undirected"
48 |     feat_drop: 0.0
49 |     attn_drop: 0.0
50 | 
51 |   graph_embedding_private:
52 |     aggregator_type: "lstm"
53 |     bias: true
54 |     norm: null
55 |     activation: "relu"
56 |     use_edge_weight: true
57 | 
58 | decoder_args:
59 |   rnn_decoder_share:
60 |     rnn_type: "lstm"
61 |     input_size: 300
62 |     hidden_size: 300
63 |     rnn_emb_input_size: 300
64 |     use_copy: true
65 |     graph_pooling_strategy: null
66 |     attention_type: "uniform"
67 |     fuse_strategy: "concatenate"
68 |     dropout: 0.3
69 |     teacher_forcing_rate: 1.0
70 | 
71 |   rnn_decoder_private:
72 |     max_decoder_step: 35
73 |     max_tree_depth: 8
74 |     use_sibling: false
75 | 


--------------------------------------------------------------------------------
/TheWebConf2022_demo/Math-word-problem/data/processed/NodeEmbGraph/data.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/TheWebConf2022_demo/Math-word-problem/data/processed/NodeEmbGraph/data.pt


--------------------------------------------------------------------------------
/TheWebConf2022_demo/Math-word-problem/data/processed/NodeEmbGraph/vocab.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/TheWebConf2022_demo/Math-word-problem/data/processed/NodeEmbGraph/vocab.pt


--------------------------------------------------------------------------------
/TheWebConf2022_demo/Math-word-problem/imgs/g2t.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/TheWebConf2022_demo/Math-word-problem/imgs/g2t.png


--------------------------------------------------------------------------------
/TheWebConf2022_demo/Math-word-problem/utils.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | import torch
 3 | import sympy
 4 | from random import randint
 5 | from sympy.parsing.sympy_parser import parse_expr
 6 | 
 7 | def convert_to_string(idx_list, form_manager):
 8 |     w_list = []
 9 |     for i in range(len(idx_list)):
10 |         w_list.append(form_manager.get_idx_symbol(int(idx_list[i])))
11 |     return " ".join(w_list)
12 | 
def is_all_same(c1, c2, form_manager):
    """Return True when two index sequences match or solve to the same value.

    The sequences are first compared element by element. Only when they
    differ (or have different lengths) is ``is_solution_same`` consulted.
    """
    if len(c1) == len(c2):
        differs = any(c1[j] != c2[j] for j in range(len(c1)))
        if not differs:
            return True
    return True if is_solution_same(c1, c2, form_manager) else False
27 | 
28 | 
def is_solution_same(i1, i2, form_manager):
    """Check whether two equations, given as symbol indices, agree on ``x``.

    Both index sequences are rendered to text via
    ``form_manager.get_idx_symbol``. Each text is split on '=', the first two
    parts are parsed with sympy and the equation is solved for ``x``; the
    first solutions are compared.

    Returns:
        ``False`` when either text lacks '=', contains the unknown token,
        has no solution, or fails to parse or solve; otherwise whether the
        first solutions are equal.
    """
    c1 = " ".join([form_manager.get_idx_symbol(x) for x in i1])
    c2 = " ".join([form_manager.get_idx_symbol(x) for x in i2])
    if ('=' not in c1) or ('=' not in c2):
        return False
    if (form_manager.unk_token in c1) or (form_manager.unk_token in c2):
        return False

    try:
        x = sympy.Symbol('x')
        # NOTE(review): parse_expr evaluates its input as code; only feed it
        # text produced from the model vocabulary, never untrusted strings.
        s1 = c1.split('=')
        s2 = c2.split('=')
        lhs1, rhs1 = parse_expr(s1[0]), parse_expr(s1[1])
        lhs2, rhs2 = parse_expr(s2[0]), parse_expr(s2[1])
        res1 = sympy.solve(sympy.Eq(lhs1, rhs1), x)
        res2 = sympy.solve(sympy.Eq(lhs2, rhs2), x)

        if not res1 or not res2:
            return False
        return res1[0] == res2[0]
    except Exception:
        # Malformed or unsolvable expressions count as a mismatch.
        # KeyboardInterrupt and SystemExit are deliberately not caught here.
        return False
61 | 
def compute_accuracy(candidate_list, reference_list, form_manager):
    """Return the fraction of candidates that match their reference.

    Pairs are compared with ``is_all_same``. When the lists differ in length
    a message is printed and only the first ``min(len)`` pairs are scored.

    Returns:
        Accuracy as a float; ``0.0`` when there is nothing to compare
        (previously this raised ``ZeroDivisionError``).
    """
    if len(candidate_list) != len(reference_list):
        print("candidate list has length {}, reference list has length {}\n".format(
            len(candidate_list), len(reference_list)))
    len_min = min(len(candidate_list), len(reference_list))
    if len_min == 0:
        return 0.0
    c = sum(
        1 for i in range(len_min)
        if is_all_same(candidate_list[i], reference_list[i], form_manager)
    )
    return c/float(len_min)
74 | 
75 | 
def compute_tree_accuracy(candidate_list_, reference_list_, form_manager):
    """Score candidates against references via ``compute_accuracy``.

    Both inputs are first copied, element by element through integer indexing,
    into fresh Python lists; those lists and ``form_manager`` are then handed to
    ``compute_accuracy`` and its result is returned unchanged.
    """
    predictions = [candidate_list_[pos] for pos in range(len(candidate_list_))]
    targets = [reference_list_[pos] for pos in range(len(reference_list_))]
    return compute_accuracy(predictions, targets, form_manager)
84 | 
def prepare_ext_vocab(batch_graph, src_vocab, device):
    """Build an extended vocabulary for a batch and attach per-node token ids.

    A deep copy of ``src_vocab`` is made so the original is left untouched. For
    every entry of ``batch_graph.node_attributes`` whose ``"type"`` is missing
    (``None``) or ``0`` and whose ``"token"`` currently resolves to the same
    index as the vocabulary's ``unk_token``, the token is added to the copy.
    The index of every node token in the (possibly grown) copy is collected,
    turned into a ``torch.long`` tensor, moved to ``device`` and stored under
    ``batch_graph.node_features["token_id_oov"]``.

    Returns the extended vocabulary copy.
    """
    ext_vocab = copy.deepcopy(src_vocab)
    oov_ids = []

    for attrs in batch_graph.node_attributes:
        tok = attrs["token"]
        # Only nodes with no type or type 0 may extend the vocabulary.
        if attrs.get("type") is None or attrs.get("type") == 0:
            if ext_vocab.get_symbol_idx(tok) == ext_vocab.get_symbol_idx(ext_vocab.unk_token):
                ext_vocab.add_symbol(tok)
        oov_ids.append(ext_vocab.get_symbol_idx(tok))

    id_tensor = torch.tensor(oov_ids, dtype=torch.long)
    batch_graph.node_features["token_id_oov"] = id_tensor.to(device)
    return ext_vocab


--------------------------------------------------------------------------------
/config/jobs/gat_bi_sep_dynamic_node_emb.yaml:
--------------------------------------------------------------------------------
 1 | graph_construction_name: "node_emb"
 2 | graph_embedding_name: "gat"
 3 | decoder_name: "stdrnn"
 4 | 
 5 | graph_construction_args:
 6 |   graph_construction_share:
 7 |     graph_type: 'node_emb'
 8 |     root_dir: "../data/jobs"
 9 |     topology_subdir: 'node_emb_graph'
10 |     thread_number: 4
11 |     port: 9000
12 |     timeout: 15000
13 | 
14 |   graph_construction_private:
15 |     edge_strategy: 'homogeneous'
16 |     merge_strategy: 'tailhead'
17 |     sequential_link: true
18 |     as_node: false
19 | 
20 |   node_embedding:
21 |     input_size: 300
22 |     hidden_size: 300
23 |     word_dropout: 0.2
24 |     rnn_dropout: 0.3
25 |     fix_bert_emb: false
26 |     fix_word_emb: false
27 |     embedding_style:
28 |       single_token_item: true
29 |       emb_strategy: "w2v_bilstm"
30 |       num_rnn_layers: 1
31 |       bert_model_name: null
32 |       bert_lower_case: null
33 | 
34 |     sim_metric_type: 'weighted_cosine'
35 |     num_heads: 1
36 |     top_k_neigh: null
37 |     epsilon_neigh: 0.5
38 |     smoothness_ratio: 0.1
39 |     connectivity_ratio: 0.05
40 |     sparsity_ratio: 0.1
41 | 
42 | graph_embedding_args:
43 |   graph_embedding_share:
44 |     num_layers: 3
45 |     input_size: 300
46 |     hidden_size: 300
47 |     output_size: 300
48 |     direction_option: "bi_sep"
49 |     feat_drop: 0.2
50 | 
51 |   graph_embedding_private:
52 |     heads: [2, 2, 1]
53 |     attn_drop: 0.2
54 |     negative_slope: 0.2
55 |     residual: false
56 |     activation: "relu"
57 | 
58 | decoder_args:
59 |   rnn_decoder_share:
60 |     rnn_type: "lstm"
61 |     input_size: 300
62 |     hidden_size: 512
63 |     rnn_emb_input_size: 300
64 |     use_copy: true
65 |     use_coverage: true
66 |     graph_pooling_strategy: null
67 |     attention_type: "sep_diff_encoder_type"
68 |     fuse_strategy: "concatenate"
69 |     dropout: 0.3
70 | 
71 |   rnn_decoder_private:
72 |     max_decoder_step: 50
73 |     node_type_num: null
74 |     tgt_emb_as_output_layer: true
75 | 
76 | other_args:
77 |   seed: 1236
78 |   checkpoint_save_path: 'out/jobs/gat_bi_sep_node_emb_ckpt'
79 |   pretrained_word_emb_name: '6B'
80 |   pretrained_word_emb_cache_dir: '.vector_cache'
81 |   share_vocab: True
82 |   word_emb_size: 300
83 |   beam_size: 4
84 |   epochs: 200 # number of maximal training epochs
85 |   batch_size: 24
86 |   learning_rate: 1.e-3
87 |   loss_display_step: 10
88 |   eval_display_number: 10
89 |   lr_start_decay_epoch: 20
90 |   lr_decay_rate: 0.9
91 |   lr_decay_per_epoch: 5
92 |   min_lr: 1.e-3
93 |   val_split_ratio: 0
94 |   num_workers: 0 # number of data loader workers
95 |   use_gpu: 1 # 0 for don't use cuda, 1 for using cuda
96 |   gpu: 0 # gpu id
97 | 


--------------------------------------------------------------------------------
/config/jobs/gat_bi_sep_dynamic_node_emb_v2.yaml:
--------------------------------------------------------------------------------
 1 | graph_construction_name: "node_emb"
 2 | graph_embedding_name: "gat"
 3 | decoder_name: "stdrnn"
 4 | 
 5 | graph_construction_args:
 6 |   graph_construction_share:
 7 |     graph_name: 'node_emb'
 8 |     root_dir: "../data/jobs"
 9 |     topology_subdir: 'NodeEmbGraph'
10 |     thread_number: 4
11 |     port: 9000
12 |     timeout: 15000
13 | 
14 |   graph_construction_private:
15 |     edge_strategy: 'homogeneous'
16 |     merge_strategy: 'tailhead'
17 |     sequential_link: true
18 |     as_node: false
19 |     sim_metric_type: 'weighted_cosine'
20 |     num_heads: 1
21 |     top_k_neigh: null
22 |     epsilon_neigh: 0.5
23 |     smoothness_ratio: 0.1
24 |     connectivity_ratio: 0.05
25 |     sparsity_ratio: 0.1
26 | 
27 | graph_initialization_args:
28 |   input_size: 300
29 |   hidden_size: 300
30 |   word_dropout: 0.2
31 |   rnn_dropout: 0.3
32 |   fix_bert_emb: false
33 |   fix_word_emb: false
34 |   embedding_style:
35 |     single_token_item: true
36 |     emb_strategy: "w2v_bilstm"
37 |     num_rnn_layers: 1
38 |     bert_model_name: null
39 |     bert_lower_case: null
40 | 
41 |     
42 | 
43 | graph_embedding_args:
44 |   graph_embedding_share:
45 |     num_layers: 3
46 |     input_size: 300
47 |     hidden_size: 300
48 |     output_size: 300
49 |     direction_option: "bi_sep"
50 |     feat_drop: 0.2
51 | 
52 |   graph_embedding_private:
53 |     heads: [2, 2, 1]
54 |     attn_drop: 0.2
55 |     negative_slope: 0.2
56 |     residual: true
57 |     activation: "relu"
58 |     allow_zero_in_degree: true
59 | 
60 | decoder_args:
61 |   rnn_decoder_share:
62 |     rnn_type: "lstm"
63 |     input_size: 300
64 |     hidden_size: 512
65 |     rnn_emb_input_size: 300
66 |     use_copy: true
67 |     use_coverage: true
68 |     graph_pooling_strategy: null
69 |     attention_type: "sep_diff_encoder_type"
70 |     fuse_strategy: "concatenate"
71 |     dropout: 0.3
72 | 
73 |   rnn_decoder_private:
74 |     max_decoder_step: 50
75 |     node_type_num: null
76 |     tgt_emb_as_output_layer: true
77 | 
78 | other_args:
79 |   seed: 1236
80 |   checkpoint_save_path: 'out/jobs/gat_bi_sep_node_emb_ckpt'
81 |   pretrained_word_emb_name: '6B'
82 |   pretrained_word_emb_cache_dir: '.vector_cache'
83 |   share_vocab: True
84 |   word_emb_size: 300
85 |   beam_size: 4
86 |   epochs: 200 # number of maximal training epochs
87 |   batch_size: 24
88 |   learning_rate: 1.e-3
89 |   loss_display_step: 10
90 |   eval_display_number: 10
91 |   lr_start_decay_epoch: 20
92 |   lr_decay_rate: 0.9
93 |   lr_decay_per_epoch: 5
94 |   min_lr: 1.e-3
95 |   val_split_ratio: 0
96 |   num_workers: 0 # number of data loader workers
97 |   use_gpu: 1 # 0 for don't use cuda, 1 for using cuda
98 |   gpu: 0 # gpu id
99 | 


--------------------------------------------------------------------------------
/config/trec/graphsage_bi_fuse_static_dependency.yaml:
--------------------------------------------------------------------------------
 1 | # Data
 2 | dataset: 'trec'
 3 | root_data_dir: '../data/trec'
 4 | val_split_ratio: 0.2 # validation set split ratio (default: 0.2)
 5 | pretrained_word_emb_name: '840B'
 6 | out_dir: 'out/trec/graphsage_bi_fuse_dependency_ckpt'
 7 | 
 8 | 
 9 | # Graph construction
10 | graph_type: 'dependency' # graph construction type ('dependency', 'constituency', 'ie', 'node_emb', 'node_emb_refined')
11 | 
12 | # Dynamic graph construction
13 | init_graph_type: null # initial graph construction type ('line', 'dependency', 'constituency', 'ie')
14 | gl_metric_type: null # similarity metric type for dynamic graph construction ('weighted_cosine', 'attention', 'rbf_kernel', 'cosine')
15 | gl_epsilon: null # epsilon for graph sparsification
16 | gl_top_k: null # top k for graph sparsification
17 | gl_num_heads: 1 # num of heads for dynamic graph construction
18 | gl_num_hidden: 300 # number of hidden units for dynamic graph construction
19 | gl_smoothness_ratio: null # smoothness ratio for graph regularization loss
20 | gl_sparsity_ratio: null # sparsity ratio for graph regularization loss
21 | gl_connectivity_ratio: null # connectivity ratio for graph regularization loss
22 | init_adj_alpha: null # alpha ratio for combining initial graph adjacency matrix
23 | 
24 | 
25 | # Graph embedding construction
26 | word_dropout: 0.4 # word embedding dropout
27 | rnn_dropout: 0.1 # RNN dropout
28 | no_fix_word_emb: false # Do not fix pretrained word embeddings (default: false)
29 | node_edge_emb_strategy: 'mean' # node edge embedding strategy for graph embedding construction ('mean', 'lstm', 'gru', 'bilstm' and 'bigru')
30 | seq_info_encode_strategy: 'bilstm' # sequence info encoding strategy for graph embedding construction ('none', 'lstm', 'gru', 'bilstm' and 'bigru')
31 | 
32 | 
33 | # GNN
34 | gnn: 'graphsage'
35 | gnn_direction_option: 'bi_fuse' # GNN direction type ('undirected', 'bi_sep', 'bi_fuse')
36 | gnn_num_layers: 1 # number of GNN layers
37 | num_hidden: 300 # number of hidden units
38 | graph_pooling: 'avg_pool' # graph pooling ('avg_pool', 'max_pool')
39 | max_pool_linear_proj: false # use linear projection for max pooling
40 | gnn_dropout: 0.3 # 0.3 # GNN input feature dropout
41 | 
42 | # GAT
43 | gat_attn_dropout: null # GAT attention dropout
44 | gat_negative_slope: null # the negative slope of leaky relu
45 | gat_num_heads: null # number of hidden attention heads
46 | gat_num_out_heads: null # number of output attention heads
47 | gat_residual: false # use gat_residual connection
48 | # GraphSAGE
49 | graphsage_aggreagte_type: 'lstm' # graphsage aggregate type ('mean', 'gcn', 'pool', 'lstm')
50 | 
51 | 
52 | # Training
53 | seed: 1234
54 | batch_size: 50 # batch size
55 | epochs: 500 # number of maximal training epochs
56 | patience: 10
57 | lr: 0.001 # learning rate
58 | lr_patience: 2
59 | lr_reduce_factor: 0.5
60 | num_workers: 1 # number of data loader workers
61 | 
62 | 
63 | gpu: 0
64 | no_cuda: false
65 | 


--------------------------------------------------------------------------------
/config/trec/graphsage_bi_fuse_static_dependency_v2.yaml:
--------------------------------------------------------------------------------
 1 | # Data
 2 | dataset: 'trec'
 3 | val_split_ratio: 0.2 # validation set split ratio (default: 0.2)
 4 | pretrained_word_emb_name: '840B'
 5 | out_dir: 'out/trec/graphsage_bi_fuse_dependency_ckpt'
 6 | 
 7 | 
 8 | # Graph construction
 9 | graph_construction_args:
10 |   graph_construction_share:
11 |     graph_name: 'dependency'
12 |     root_dir: '../data/trec'
13 |     thread_number: 10
14 |     port: 9000
15 |     timeout: 15000
16 | 
17 |   graph_construction_private:
18 |     edge_strategy: 'homogeneous'
19 |     merge_strategy: 'tailhead'
20 |     sequential_link: true
21 |     as_node: false
22 |     dynamic_init_graph_name: null # initial graph construction type ('line', 'dependency', 'constituency', 'ie')
23 | 
24 | 
25 | 
26 | # Dynamic graph construction
27 | gl_metric_type: null # similarity metric type for dynamic graph construction ('weighted_cosine', 'attention', 'rbf_kernel', 'cosine')
28 | gl_epsilon: null # epsilon for graph sparsification
29 | gl_top_k: null # top k for graph sparsification
30 | gl_num_heads: 1 # num of heads for dynamic graph construction
31 | gl_num_hidden: 300 # number of hidden units for dynamic graph construction
32 | gl_smoothness_ratio: null # smoothness ratio for graph regularization loss
33 | gl_sparsity_ratio: null # sparsity ratio for graph regularization loss
34 | gl_connectivity_ratio: null # connectivity ratio for graph regularization loss
35 | init_adj_alpha: null # alpha ratio for combining initial graph adjacency matrix
36 | 
37 | 
38 | # Graph embedding construction
39 | word_dropout: 0.4 # word embedding dropout
40 | rnn_dropout: 0.1 # RNN dropout
41 | no_fix_word_emb: false # Do not fix pretrained word embeddings (default: false)
42 | emb_strategy: 'w2v_bilstm'
43 | 
44 | # GNN
45 | gnn: 'graphsage'
46 | gnn_direction_option: 'bi_fuse' # GNN direction type ('undirected', 'bi_sep', 'bi_fuse')
47 | gnn_num_layers: 1 # number of GNN layers
48 | num_hidden: 300 # number of hidden units
49 | graph_pooling: 'avg_pool' # graph pooling ('avg_pool', 'max_pool')
50 | max_pool_linear_proj: false # use linear projection for max pooling
51 | gnn_dropout: 0.4 # 0.4 # GNN input feature dropout
52 | 
53 | # GAT
54 | gat_attn_dropout: null # GAT attention dropout
55 | gat_negative_slope: null # the negative slope of leaky relu
56 | gat_num_heads: null # number of hidden attention heads
57 | gat_num_out_heads: null # number of output attention heads
58 | gat_residual: false # use gat_residual connection
59 | # GraphSAGE
60 | graphsage_aggreagte_type: 'lstm' # graphsage aggregate type ('mean', 'gcn', 'pool', 'lstm')
61 | 
62 | 
63 | # Training
64 | seed: 1234
65 | batch_size: 50 # batch size
66 | epochs: 500 # number of maximal training epochs
67 | patience: 10
68 | lr: 0.001 # learning rate
69 | lr_patience: 2
70 | lr_reduce_factor: 0.5
71 | num_workers: 0 # number of data loader workers
72 | 
73 | 
74 | gpu: -1
75 | no_cuda: false
76 | 


--------------------------------------------------------------------------------
/data/jobs/processed/NodeEmbGraph/data.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/jobs/processed/NodeEmbGraph/data.pt


--------------------------------------------------------------------------------
/data/jobs/processed/NodeEmbGraph/vocab.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/jobs/processed/NodeEmbGraph/vocab.pt


--------------------------------------------------------------------------------
/data/jobs/processed/node_emb_graph/data.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/jobs/processed/node_emb_graph/data.pt


--------------------------------------------------------------------------------
/data/jobs/processed/node_emb_graph/vocab.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/jobs/processed/node_emb_graph/vocab.pt


--------------------------------------------------------------------------------
/data/jobs/raw/sequence.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/jobs/raw/sequence.pt


--------------------------------------------------------------------------------
/data/jobs/raw/vocab.f.txt:
--------------------------------------------------------------------------------
 1 | ,	1996
 2 | )	1498
 3 | (	1498
 4 | ANS	1470
 5 | job	494
 6 | language	209
 7 | languageid0	195
 8 | loc	167
 9 | locid0	164
10 | req_deg	111
11 | degid0	105
12 | platform	102
13 | platformid0	97
14 | area	87
15 | areaid0	84
16 | titleid0	79
17 | title	79
18 | salary_greater_than	66
19 | num_salary	66
20 | year	65
21 | req_exp	63
22 | \+	62
23 | company	58
24 | companyid0	56
25 | year0	31
26 | application	22
27 | languageid1	19
28 | A	19
29 | applicationid0	14
30 | des_deg	13
31 | des_exp	11
32 | recruiter	6
33 | B	6
34 | recruiterid0	6
35 | degid1	4
36 | ;	3
37 | year1	3
38 | X	3
39 | areaid1	3
40 | languageid2	2
41 | platformid1	2
42 | P	2
43 | 20	1
44 | month	1
45 | salary_less_than	1
46 | hour	1
47 | locid1	1
48 | 


--------------------------------------------------------------------------------
/data/jobs/raw/vocab.q.txt:
--------------------------------------------------------------------------------
  1 | job	452
  2 | in	200
  3 | languageid0	195
  4 | what	188
  5 | are	186
  6 | a	166
  7 | locid0	164
  8 | requir	163
  9 | there	142
 10 | me	135
 11 | show	132
 12 | the	130
 13 | degid0	105
 14 | use	100
 15 | platformid0	98
 16 | for	86
 17 | areaid0	84
 18 | titleid0	79
 19 | that	74
 20 | experi	72
 21 | on	70
 22 | ani	68
 23 | num_salary	66
 24 | with	65
 25 | of	63
 26 | and	63
 27 | companyid0	59
 28 | year	53
 29 | not	32
 30 | year0	31
 31 | list	31
 32 | pay	31
 33 | at	29
 34 | give	26
 35 | degre	25
 36 | desir	22
 37 | but	22
 38 | which	22
 39 | specialist	21
 40 | do	20
 41 | languageid1	19
 42 | salari	19
 43 | avail	17
 44 | no	17
 45 | all	16
 46 | knowledg	16
 47 | posit	16
 48 | have	15
 49 | i	15
 50 | applicationid0	14
 51 | per	14
 52 | tell	13
 53 | offer	12
 54 | than	12
 55 | to	11
 56 | open	11
 57 | is	11
 58 | least	10
 59 | work	10
 60 | program	9
 61 | know	9
 62 | want	9
 63 | who	9
 64 | find	8
 65 | dont	8
 66 | need	8
 67 | area	7
 68 | greater	6
 69 | recruiterid0	6
 70 | can	6
 71 | as	6
 72 | you	6
 73 | doe	6
 74 | over	5
 75 | involv	5
 76 | locat	5
 77 | specialti	5
 78 | like	5
 79 | more	5
 80 | compani	5
 81 | recruit	5
 82 | an	5
 83 | dollar	4
 84 | out	4
 85 | could	4
 86 | comput	4
 87 | degid1	4
 88 | onli	3
 89 | year1	3
 90 | anyth	3
 91 | deal	3
 92 | some	3
 93 | areaid1	3
 94 | or	3
 95 | platform	3
 96 | would	3
 97 | peopl	3
 98 | hire	3
 99 | develop	3
100 | titl	3
101 | everyth	3
102 | move	2
103 | administr	2
104 | see	2
105 | special	2
106 | machin	2
107 | colleg	2
108 | major	2
109 | within	2
110 | art	2
111 | languageid2	2
112 | from	2
113 | system	2
114 | their	2
115 | tool	2
116 | wish	2
117 | employ	2
118 | were	2
119 | applic	2
120 | platformid1	2
121 | concern	2
122 | sure	2
123 | student	2
124 | fresh	2
125 | buzword	1
126 | help	1
127 | month	1
128 | languag	1
129 | field	1
130 | less	1
131 | might	1
132 | familiar	1
133 | vaniti	1
134 | earn	1
135 | name	1
136 | where	1
137 | pleas	1
138 | old	1
139 | locid1	1
140 | someth	1
141 | hardwar	1
142 | learn	1
143 | key	1
144 | 20	1
145 | outsid	1
146 | oper	1
147 | wonder	1
148 | doesnt	1
149 | cs	1
150 | .	1
151 | live	1
152 | much	1
153 | relat	1
154 | hold	1
155 | look	1
156 | hour	1
157 | us	1
158 | near	1
159 | it	1
160 | if	1
161 | make	1
162 | id	1
163 | satiat	1
164 | how	1
165 | anyon	1
166 | someon	1
167 | associ	1
168 | without	1
169 | environ	1
170 | greed	1
171 | 


--------------------------------------------------------------------------------
/data/trec/processed/dependency_graph/data.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/trec/processed/dependency_graph/data.pt


--------------------------------------------------------------------------------
/data/trec/processed/dependency_graph/label.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/trec/processed/dependency_graph/label.pt


--------------------------------------------------------------------------------
/data/trec/processed/dependency_graph/vocab.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graph4ai/graph4nlp_demo/b5f81226201f202eff3084632a16f2c96b2ce7a4/data/trec/processed/dependency_graph/vocab.pt


--------------------------------------------------------------------------------